git.proxmox.com Git - ceph.git/commitdiff
update source to 12.2.11
author    Alwin Antreich <a.antreich@proxmox.com>
          Wed, 6 Feb 2019 08:29:01 +0000 (09:29 +0100)
committer Alwin Antreich <a.antreich@proxmox.com>
          Wed, 6 Feb 2019 09:34:02 +0000 (10:34 +0100)
Signed-off-by: Alwin Antreich <a.antreich@proxmox.com>
348 files changed:
Makefile
ceph/CMakeLists.txt
ceph/PendingReleaseNotes
ceph/alpine/APKBUILD
ceph/ceph.spec
ceph/ceph.spec.in
ceph/debian/changelog
ceph/debian/control
ceph/doc/README.md [new file with mode: 0644]
ceph/doc/_ext/edit_on_github.py [new file with mode: 0644]
ceph/doc/_static/js/ceph.js [new file with mode: 0644]
ceph/doc/_templates/page.html [new file with mode: 0644]
ceph/doc/ceph-volume/lvm/zap.rst
ceph/doc/cephfs/dirfrags.rst
ceph/doc/cephfs/eviction.rst
ceph/doc/cephfs/fuse.rst
ceph/doc/cephfs/health-messages.rst
ceph/doc/cephfs/mds-config-ref.rst
ceph/doc/conf.py
ceph/doc/man/8/ceph-volume.rst
ceph/doc/man/8/crushtool.rst
ceph/doc/mgr/balancer.rst
ceph/doc/rados/configuration/bluestore-config-ref.rst
ceph/doc/rados/operations/add-or-rm-mons.rst
ceph/doc/rados/operations/crush-map-edits.rst
ceph/doc/rados/operations/crush-map.rst
ceph/doc/rados/operations/user-management.rst
ceph/doc/rados/troubleshooting/troubleshooting-mon.rst
ceph/doc/radosgw/adminops.rst
ceph/doc/radosgw/config-ref.rst
ceph/doc/radosgw/encryption.rst
ceph/doc/radosgw/frontends.rst
ceph/doc/start/hardware-recommendations.rst
ceph/doc/start/quick-ceph-deploy.rst
ceph/examples/librados/Makefile
ceph/examples/librados/hello_world.readme
ceph/install-deps.sh
ceph/qa/cephfs/clusters/1-mds-1-client-coloc.yaml [new file with mode: 0644]
ceph/qa/cephfs/clusters/1-mds-1-client.yaml
ceph/qa/cephfs/clusters/1-mds-2-client-coloc.yaml [new file with mode: 0644]
ceph/qa/cephfs/clusters/1-mds-2-client.yaml
ceph/qa/cephfs/clusters/1-mds-3-client.yaml [new file with mode: 0644]
ceph/qa/cephfs/clusters/1-mds-4-client-coloc.yaml [new file with mode: 0644]
ceph/qa/cephfs/clusters/1-mds-4-client.yaml
ceph/qa/cephfs/clusters/3-mds.yaml
ceph/qa/cephfs/clusters/9-mds.yaml
ceph/qa/cephfs/clusters/fixed-2-ucephfs.yaml
ceph/qa/run-standalone.sh
ceph/qa/standalone/ceph-helpers.sh
ceph/qa/standalone/scrub/osd-scrub-repair.sh
ceph/qa/suites/ceph-ansible/smoke/basic/2-ceph/ceph_ansible.yaml
ceph/qa/suites/fs/32bits/tasks/cfuse_workunit_suites_pjd.yaml
ceph/qa/suites/fs/basic_functional/clusters/1-mds-4-client-coloc.yaml [new symlink]
ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml [deleted file]
ceph/qa/suites/fs/basic_functional/tasks/damage.yaml
ceph/qa/suites/fs/basic_workload/tasks/cfuse_workunit_suites_pjd.yaml
ceph/qa/suites/fs/bugs/client_trim_caps/clusters/small-cluster.yaml
ceph/qa/suites/fs/multiclient/clusters/1-mds-2-client.yaml [new symlink]
ceph/qa/suites/fs/multiclient/clusters/1-mds-3-client.yaml [new symlink]
ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml [deleted file]
ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml [deleted file]
ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml [deleted file]
ceph/qa/suites/fs/permission/tasks/cfuse_workunit_suites_pjd.yaml
ceph/qa/suites/fs/thrash/clusters/1-mds-1-client-coloc.yaml [new symlink]
ceph/qa/suites/fs/thrash/clusters/mds-1active-1standby.yaml [deleted file]
ceph/qa/suites/fs/thrash/msgr-failures/osd-mds-delay.yaml
ceph/qa/suites/fs/thrash/tasks/cfuse_workunit_suites_pjd.yaml
ceph/qa/suites/kcephfs/cephfs/tasks/kclient_workunit_suites_pjd.yaml
ceph/qa/suites/kcephfs/recovery/tasks/damage.yaml
ceph/qa/suites/multimds/basic/tasks/cfuse_workunit_suites_pjd.yaml
ceph/qa/suites/rados/singleton/all/mon-config-key-caps.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/normal_pg_log.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/short_pg_log.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/normal_pg_log.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/short_pg_log.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/normal_pg_log.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/short_pg_log.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/jewel-x/stress-split/2-partial-upgrade/firsthalf.yaml
ceph/qa/suites/upgrade/jewel-x/stress-split/5-finish-upgrade.yaml
ceph/qa/suites/upgrade/luminous-p2p/% [deleted file]
ceph/qa/suites/upgrade/luminous-p2p/.qa [deleted symlink]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/% [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/.qa [new symlink]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/point-to-point-upgrade.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/supported [new symlink]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/% [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/.qa [new symlink]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/openstack.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/start.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1-ceph-install/luminous.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/normal_pg_log.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/short_pg_log.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/.qa [new symlink]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/firsthalf.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/.qa [new symlink]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/default.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/.qa [new symlink]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/radosbench.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-cls.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-import-export.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd_api.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/readwrite.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/snaps-few-objects.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/5-finish-upgrade.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/+ [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/.qa [new symlink]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rbd-python.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rgw-swift.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/snaps-many-objects.yaml [new file with mode: 0644]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/supported [new symlink]
ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/thrashosds-health.yaml [new symlink]
ceph/qa/suites/upgrade/luminous-p2p/point-to-point-upgrade.yaml [deleted file]
ceph/qa/suites/upgrade/luminous-p2p/supported [deleted symlink]
ceph/qa/tasks/cephfs/filesystem.py
ceph/qa/tasks/cephfs/fuse_mount.py
ceph/qa/tasks/cephfs/kernel_mount.py
ceph/qa/tasks/cephfs/test_client_limits.py
ceph/qa/tasks/cephfs/test_client_recovery.py
ceph/qa/tasks/cephfs/test_damage.py
ceph/qa/tasks/cephfs/test_data_scan.py
ceph/qa/tasks/cephfs/test_flush.py
ceph/qa/tasks/cephfs/test_forward_scrub.py
ceph/qa/tasks/cephfs/test_fragment.py
ceph/qa/tasks/cephfs/test_journal_migration.py
ceph/qa/tasks/cephfs/test_journal_repair.py
ceph/qa/tasks/cephfs/test_misc.py
ceph/qa/tasks/cephfs/test_recovery_pool.py
ceph/qa/tasks/qemu.py
ceph/qa/tasks/thrashosds-health.yaml
ceph/qa/tasks/workunit.py
ceph/qa/workunits/ceph-tests/ceph-admin-commands.sh
ceph/qa/workunits/mon/test_config_key_caps.sh [new file with mode: 0755]
ceph/qa/workunits/rados/test_librados_build.sh
ceph/qa/workunits/rbd/run_devstack_tempest.sh
ceph/qa/workunits/suites/cephfs_journal_tool_smoke.sh
ceph/run-make-check.sh
ceph/src/.git_version
ceph/src/auth/AuthSessionHandler.cc
ceph/src/ceph-create-keys
ceph/src/ceph-volume/ceph_volume/api/lvm.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/activate.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/batch.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/bluestore.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/filestore.py
ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/strategies.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py
ceph/src/ceph-volume/ceph_volume/inventory/main.py
ceph/src/ceph-volume/ceph_volume/tests/conftest.py
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py
ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/devices/test_zap.py
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type-dmcrypt/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type-dmcrypt/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type-dmcrypt/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type-dmcrypt/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/playbooks/test_zap.yml [new file with mode: 0644]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type-dmcrypt/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type-dmcrypt/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type/test_zap.yml [new symlink]
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/bluestore/dmcrypt/test.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos7/filestore/dmcrypt/test.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_bluestore.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/bluestore/dmcrypt/test.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/xenial/filestore/dmcrypt/test.yml
ceph/src/ceph-volume/ceph_volume/tests/functional/playbooks/deploy.yml
ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py
ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py
ceph/src/ceph-volume/ceph_volume/tests/util/test_encryption.py
ceph/src/ceph-volume/ceph_volume/tests/util/test_util.py
ceph/src/ceph-volume/ceph_volume/util/__init__.py
ceph/src/ceph-volume/ceph_volume/util/arg_validators.py
ceph/src/ceph-volume/ceph_volume/util/device.py
ceph/src/ceph-volume/ceph_volume/util/disk.py
ceph/src/ceph-volume/ceph_volume/util/encryption.py
ceph/src/client/Client.cc
ceph/src/cls/lock/cls_lock.cc
ceph/src/cls/lock/cls_lock_client.cc
ceph/src/cls/lock/cls_lock_client.h
ceph/src/cls/lock/cls_lock_ops.cc
ceph/src/cls/lock/cls_lock_ops.h
ceph/src/cls/lock/cls_lock_types.h
ceph/src/cls/rgw/cls_rgw.cc
ceph/src/cls/rgw/cls_rgw_client.cc
ceph/src/cls/rgw/cls_rgw_client.h
ceph/src/cls/rgw/cls_rgw_types.h
ceph/src/common/Cond.h
ceph/src/common/CondVar.h [new file with mode: 0644]
ceph/src/common/TrackedOp.cc
ceph/src/common/WeightedPriorityQueue.h
ceph/src/common/buffer.cc
ceph/src/common/ceph_context.cc
ceph/src/common/cmdparse.h
ceph/src/common/config.cc
ceph/src/common/config.h
ceph/src/common/hobject.h
ceph/src/common/legacy_config_opts.h
ceph/src/common/options.cc
ceph/src/crush/CrushCompiler.cc
ceph/src/crush/CrushTester.cc
ceph/src/crush/CrushTester.h
ceph/src/crush/CrushWrapper.cc
ceph/src/crush/CrushWrapper.h
ceph/src/include/buffer.h
ceph/src/include/ceph_features.h
ceph/src/include/ceph_fs.h
ceph/src/include/cephfs/libcephfs.h
ceph/src/include/config-h.in.cmake
ceph/src/include/rados.h
ceph/src/include/rados/librados.hpp
ceph/src/librados/librados.cc
ceph/src/librbd/librbd.cc
ceph/src/librbd/operation/ResizeRequest.cc
ceph/src/mds/CInode.cc
ceph/src/mds/CInode.h
ceph/src/mds/FSMap.cc
ceph/src/mds/Locker.cc
ceph/src/mds/MDBalancer.cc
ceph/src/mds/MDCache.cc
ceph/src/mds/MDCache.h
ceph/src/mds/MDLog.cc
ceph/src/mds/MDSDaemon.cc
ceph/src/mds/MDSMap.h
ceph/src/mds/MDSRank.cc
ceph/src/mds/MDSRank.h
ceph/src/mds/PurgeQueue.cc
ceph/src/mds/PurgeQueue.h
ceph/src/mds/Server.cc
ceph/src/mds/Server.h
ceph/src/mds/SessionMap.h
ceph/src/mds/StrayManager.cc
ceph/src/mgr/DaemonServer.cc
ceph/src/mgr/DaemonState.cc
ceph/src/mon/AuthMonitor.cc
ceph/src/mon/ConfigKeyService.cc
ceph/src/mon/FSCommands.cc
ceph/src/mon/LogMonitor.cc
ceph/src/mon/MDSMonitor.cc
ceph/src/mon/MgrMonitor.cc
ceph/src/mon/MonCap.cc
ceph/src/mon/MonCommands.h
ceph/src/mon/Monitor.cc
ceph/src/mon/MonmapMonitor.cc
ceph/src/mon/OSDMonitor.cc
ceph/src/os/bluestore/BlueFS.cc
ceph/src/os/bluestore/BlueFS.h
ceph/src/os/bluestore/BlueStore.cc
ceph/src/os/bluestore/BlueStore.h
ceph/src/os/bluestore/bluestore_tool.cc
ceph/src/os/filestore/LFNIndex.h
ceph/src/osd/OSD.cc
ceph/src/osd/OSD.h
ceph/src/osd/OSDMap.cc
ceph/src/osd/OSDMap.h
ceph/src/osd/PG.cc
ceph/src/osd/PG.h
ceph/src/osd/PGLog.cc
ceph/src/osd/PGLog.h
ceph/src/osd/PrimaryLogPG.cc
ceph/src/osd/PrimaryLogPG.h
ceph/src/osd/osd_types.h
ceph/src/osdc/Journaler.cc
ceph/src/osdc/ObjectCacher.cc
ceph/src/osdc/ObjectCacher.h
ceph/src/osdc/Objecter.cc
ceph/src/pybind/ceph_volume_client.py
ceph/src/pybind/mgr/balancer/module.py
ceph/src/pybind/mgr/influx/module.py
ceph/src/pybind/mgr/prometheus/module.py
ceph/src/pybind/mgr/restful/common.py
ceph/src/pybind/mgr/restful/module.py
ceph/src/pybind/mgr/status/module.py
ceph/src/pybind/rbd/rbd.pyx
ceph/src/rgw/CMakeLists.txt
ceph/src/rgw/librgw.cc
ceph/src/rgw/rgw_admin.cc
ceph/src/rgw/rgw_asio_client.cc
ceph/src/rgw/rgw_asio_client.h
ceph/src/rgw/rgw_asio_frontend.cc
ceph/src/rgw/rgw_auth.cc
ceph/src/rgw/rgw_auth_s3.cc
ceph/src/rgw/rgw_bucket.cc
ceph/src/rgw/rgw_bucket.h
ceph/src/rgw/rgw_common.cc
ceph/src/rgw/rgw_common.h
ceph/src/rgw/rgw_cr_rados.cc
ceph/src/rgw/rgw_crypt.cc
ceph/src/rgw/rgw_data_sync.cc
ceph/src/rgw/rgw_file.h
ceph/src/rgw/rgw_iam_policy.cc
ceph/src/rgw/rgw_metadata.cc
ceph/src/rgw/rgw_metadata.h
ceph/src/rgw/rgw_op.cc
ceph/src/rgw/rgw_op.h
ceph/src/rgw/rgw_quota.cc
ceph/src/rgw/rgw_quota.h
ceph/src/rgw/rgw_rados.cc
ceph/src/rgw/rgw_rados.h
ceph/src/rgw/rgw_reshard.cc
ceph/src/rgw/rgw_reshard.h
ceph/src/rgw/rgw_rest.cc
ceph/src/rgw/rgw_rest_s3.cc
ceph/src/rgw/rgw_rest_swift.cc
ceph/src/rgw/rgw_rest_user.cc
ceph/src/rgw/rgw_sync_log_trim.cc
ceph/src/rgw/rgw_sync_module_es.cc
ceph/src/rgw/rgw_user.cc
ceph/src/test/cli/crushtool/crush-classes/a [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/b [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/beesly [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/c [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/d [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/e [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/f [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/flax [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/g [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/gabe [new file with mode: 0644]
ceph/src/test/cli/crushtool/crush-classes/gabe2 [new file with mode: 0644]
ceph/src/test/cli/crushtool/help.t [changed mode: 0755->0644]
ceph/src/test/cli/crushtool/reclassify.t [new file with mode: 0644]
ceph/src/test/cli/radosgw-admin/help.t
ceph/src/test/cls_lock/test_cls_lock.cc
ceph/src/test/cls_rgw/test_cls_rgw.cc
ceph/src/test/compressor/CMakeLists.txt
ceph/src/test/encoding/readable.sh
ceph/src/test/librados/aio.cc
ceph/src/test/librados/lock.cc
ceph/src/test/librados_test_stub/LibradosTestStub.cc
ceph/src/test/objectstore/store_test.cc
ceph/src/test/osd/TestOSDMap.cc
ceph/src/test/rgw/rgw_multi/multisite.py
ceph/src/test/rgw/rgw_multi/tests.py
ceph/src/test/rgw/test_rgw_iam_policy.cc
ceph/src/tools/cephfs/JournalTool.cc
ceph/src/tools/cephfs/JournalTool.h
ceph/src/tools/cephfs/RoleSelector.cc
ceph/src/tools/cephfs/RoleSelector.h
ceph/src/tools/crushtool.cc
ceph/src/tools/rados/rados.cc
ceph/src/tools/rbd_mirror/ImageReplayer.cc

index 8bb42e96fd709385b0b456ea21ec67ffb4d46997..505198c63bf2f188a07727b3991b63ac4db245de 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
-RELEASE=5.2
+RELEASE=5.3
 
 PACKAGE=ceph
-VER=12.2.10
+VER=12.2.11
 DEBREL=pve1
 
 SRCDIR=ceph
index 35c19393691e87075019709e717e8d56d8224da0..5403de8f4ce5ea18a91aaec6a5dce1613dd23dd1 100644 (file)
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 2.8.11)
 
 project(ceph)
-set(VERSION 12.2.10)
+set(VERSION 12.2.11)
 
 if(POLICY CMP0046)
   # Tweak policies (this one disables "missing" dependency warning)
@@ -367,6 +367,8 @@ endif()
 option(WITH_RADOSGW "Rados Gateway is enabled" ON)
 option(WITH_RADOSGW_FCGI_FRONTEND "Rados Gateway's FCGI frontend is enabled" OFF)
 option(WITH_RADOSGW_BEAST_FRONTEND "Rados Gateway's Beast frontend is enabled" ON)
+option(WITH_RADOSGW_BEAST_OPENSSL "Rados Gateway's Beast frontend uses OpenSSL" ON)
+
 if(WITH_RADOSGW)
   find_package(EXPAT REQUIRED)
   if(WITH_RADOSGW_FCGI_FRONTEND)
@@ -376,14 +378,7 @@ if(WITH_RADOSGW)
     message(WARNING "disabling WITH_RADOSGW_BEAST_FRONTEND, which depends on WITH_BOOST_CONTEXT")
     set(WITH_RADOSGW_BEAST_FRONTEND OFF)
   endif()
-endif(WITH_RADOSGW)
-
 
-if (WITH_RADOSGW)
-  if (NOT DEFINED OPENSSL_FOUND)
-    message(STATUS "Looking for openssl anyways, because radosgw selected")
-    find_package(OpenSSL)
-  endif()
 # https://curl.haxx.se/docs/install.html mentions the
 # configure flags for various ssl backends
   execute_process(
@@ -396,7 +391,13 @@ if (WITH_RADOSGW)
   if (CURL_CONFIG_ERRORS)
     message(WARNING "unable to run curl-config; rgw cannot make ssl requests to external systems reliably")
   endif()
-  find_package(OpenSSL)
+
+  if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL)
+    find_package(OpenSSL REQUIRED)
+  else()
+    find_package(OpenSSL)
+  endif()
+
   if (OPENSSL_FOUND)
     if (NOT NO_CURL_SSL_LINK)
       message(STATUS "libcurl is linked with openssl: explicitly setting locks")
index 00ee957e046bd19a496684cd74f6a24e6e743fe5..b75c79fb115a3178d29c3b3321ce0c1513eeda0a 100644 (file)
@@ -1,3 +1,13 @@
+>= 12.2.11
+----------
+* `cephfs-journal-tool` makes the rank argument (--rank) mandatory. Rank is
+  of the format `filesystem:rank`, where `filesystem` is the cephfs filesystem
+  and `rank` is the MDS rank on which the operation is to be executed. To
+  operate on all ranks, use `all` or `*` as the rank specifier. Note that
+  operations that dump journal information to file will now dump to per-rank
+  suffixed dump files. Importing journal information from dump files is
+  disallowed if the operation is targeted at all ranks.
+
 >= 12.1.2
 ---------
 * When running 'df' on a CephFS filesystem comprising exactly one data pool,
   a clean upgrade path is added to the pg log hard limit patches.
 
   See also: http://tracker.ceph.com/issues/36686
+
+12.2.11
+-------
+
+* The default memory utilization for the mons has been increased
+  somewhat.  Rocksdb now uses 512 MB of RAM by default, which should
+  be sufficient for small to medium-sized clusters; large clusters
+  should tune this up.  Also, the ``mon_osd_cache_size`` has been
+  increased from 10 OSDMaps to 500, which will translate to an
+  additional 500 MB to 1 GB of RAM for large clusters, and much less
+  for small clusters.
+
+* New CephFS file system attributes session_timeout and session_autoclose
+  are configurable via `ceph fs set`. The MDS config options
+  mds_session_timeout, mds_session_autoclose, and mds_max_file_size are now
+  obsolete.
+
+* This release fixes the pg log hard limit bug (https://tracker.ceph.com/issues/23979).
+  A flag called pglog_hardlimit has been introduced. It is off by default.
+  This flag enables the feature that limits the length of the pg log. Users should run
+  'ceph osd set pglog_hardlimit' after completely upgrading to 12.2.11. Once all the OSDs
+  have this flag set, the length of the pg log will be capped by a hard limit. We do not
+  recommend unsetting this flag beyond this point.
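
As a hedged sketch of the post-upgrade step described in the note above (the
commands assume every OSD already runs 12.2.11; verify before setting the
flag)::

    # confirm all OSDs report 12.2.11
    ceph versions
    # enable the pg log hard limit; unsetting it later is not recommended
    ceph osd set pglog_hardlimit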
index 26f824c7ad3cf71ec094427e74383a907add5e5f..220346e45b027c69db7e4ad94acf9aa0a8432107 100644 (file)
@@ -1,7 +1,7 @@
 # Contributor: John Coyle <dx9err@gmail.com>
 # Maintainer: John Coyle <dx9err@gmail.com>
 pkgname=ceph
-pkgver=12.2.10
+pkgver=12.2.11
 pkgrel=0
 pkgdesc="Ceph is a distributed object store and file system"
 pkgusers="ceph"
@@ -63,7 +63,7 @@ makedepends="
        xmlstarlet
        yasm
 "
-source="ceph-12.2.10.tar.bz2"
+source="ceph-12.2.11.tar.bz2"
 subpackages="
        $pkgname-base
        $pkgname-common
@@ -116,7 +116,7 @@ _sysconfdir=/etc
 _udevrulesdir=/etc/udev/rules.d
 _python_sitelib=/usr/lib/python2.7/site-packages
 
-builddir=$srcdir/ceph-12.2.10
+builddir=$srcdir/ceph-12.2.11
 
 build() {
        export CEPH_BUILD_VIRTUALENV=$builddir
index 94d44b690fbdfe5a79baef89c55ddb27bea31ebf..d1020673849d670e40070f55745d33f5975f8c69 100644 (file)
@@ -61,7 +61,7 @@
 # main package definition
 #################################################################################
 Name:          ceph
-Version:       12.2.10
+Version:       12.2.11
 Release:       0%{?dist}
 %if 0%{?fedora} || 0%{?rhel}
 Epoch:         2
@@ -77,7 +77,7 @@ License:      LGPL-2.1 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-3-Clause and
 Group:         System/Filesystems
 %endif
 URL:           http://ceph.com/
-Source0:       http://ceph.com/download/ceph-12.2.10.tar.bz2
+Source0:       http://ceph.com/download/ceph-12.2.11.tar.bz2
 %if 0%{?suse_version}
 %if 0%{?is_opensuse}
 ExclusiveArch:  x86_64 aarch64 ppc64 ppc64le
@@ -788,7 +788,7 @@ python-rbd, python-rgw or python-cephfs instead.
 # common
 #################################################################################
 %prep
-%autosetup -p1 -n ceph-12.2.10
+%autosetup -p1 -n ceph-12.2.11
 
 %build
 %if 0%{with cephfs_java}
@@ -806,6 +806,7 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 export CPPFLAGS="$java_inc"
 export CFLAGS="$RPM_OPT_FLAGS"
 export CXXFLAGS="$RPM_OPT_FLAGS"
+export LDFLAGS="$RPM_LD_FLAGS"
 
 env | sort
 
index d708aea337a01658ed3b7467d7a5240ca16b9a21..fa34ade2dd11b1243df5d762b7c340ca3be90920 100644 (file)
@@ -806,6 +806,7 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 export CPPFLAGS="$java_inc"
 export CFLAGS="$RPM_OPT_FLAGS"
 export CXXFLAGS="$RPM_OPT_FLAGS"
+export LDFLAGS="$RPM_LD_FLAGS"
 
 env | sort
 
index 2b61ec8825e1191ec733d7231c4fcd1659634f66..eaed6bfbb438e3a9b5128cec2840ece74f604d6a 100644 (file)
@@ -1,3 +1,9 @@
+ceph (12.2.11-1) stable; urgency=medium
+
+  * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com>  Wed, 30 Jan 2019 15:51:24 +0000
+
 ceph (12.2.10-1) stable; urgency=medium
 
   * New upstream release
index 6d01e31154a759768045bc69fdea0d52926a5212..65c29ed0d532fa1e5b4a9749cda20f91f543cc41 100644 (file)
@@ -402,11 +402,13 @@ Replaces: ceph (<< 10),
           ceph-test (<< 9.0.3-1646),
           librbd1 (<< 0.92-1238),
           python-ceph (<< 0.92-1223),
+         radosgw (<< 12.0.3)
 Breaks: ceph (<< 10),
         ceph-fs-common (<< 11.0),
         ceph-test (<< 9.0.3-1646),
         librbd1 (<< 0.92-1238),
         python-ceph (<< 0.92-1223),
+       radosgw (<< 12.0.3)
 Suggests: ceph-base (= ${binary:Version}),
           ceph-mds (= ${binary:Version}),
 Description: common utilities to mount and interact with a ceph storage cluster
diff --git a/ceph/doc/README.md b/ceph/doc/README.md
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/doc/_ext/edit_on_github.py b/ceph/doc/_ext/edit_on_github.py
new file mode 100644 (file)
index 0000000..290f4b4
--- /dev/null
@@ -0,0 +1,43 @@
+"""
+Adapted from https://gist.github.com/mgedmin/6052926
+
+Sphinx extension to add ReadTheDocs-style "Edit on GitHub" links to the
+sidebar.
+
+Loosely based on https://github.com/astropy/astropy/pull/347
+"""
+
+import os
+import warnings
+
+
+__licence__ = 'BSD (3 clause)'
+
+
+def get_github_url(app, view, path):
+    return 'https://github.com/{project}/{view}/{branch}/doc/{path}'.format(
+        project=app.config.edit_on_github_project,
+        view=view,
+        branch=app.config.edit_on_github_branch,
+        path=path)
+
+
+def html_page_context(app, pagename, templatename, context, doctree):
+    if templatename != 'page.html':
+        return
+
+    if not app.config.edit_on_github_project:
+        warnings.warn("edit_on_github_project not specified")
+        return
+
+    path = os.path.relpath(doctree.get('source'), app.builder.srcdir)
+    show_url = get_github_url(app, 'blob', path)
+    edit_url = get_github_url(app, 'edit', path)
+
+    context['show_on_github_url'] = show_url
+    context['edit_on_github_url'] = edit_url
+
+def setup(app):
+    app.add_config_value('edit_on_github_project', '', True)
+    app.add_config_value('edit_on_github_branch', 'master', True)
+    app.connect('html-page-context', html_page_context)
diff --git a/ceph/doc/_static/js/ceph.js b/ceph/doc/_static/js/ceph.js
new file mode 100644 (file)
index 0000000..61f95fb
--- /dev/null
@@ -0,0 +1,41 @@
+$(function() {
+  var releases_url = "http://docs.ceph.com/docs/master/releases.json";
+
+  function show_edit(branch, data) {
+    if (branch) {
+      if (branch === "master") {
+        $("#dev-warning").show();
+        return true;
+      }
+      if (data && data.releases && branch in data.releases) {
+        var eol = ("actual_eol" in data.releases[branch]);
+        if (eol) {
+          $("#eol-warning").show();
+        }
+        return !eol;
+      }
+    }
+    $("#dev-warning").show();
+    return false;
+  }
+
+  function get_branch() {
+    var url = window.location.href;
+    var res = url.match(/docs.ceph.com\/docs\/([a-z]+)\/?/i)
+    if (res) {
+      return res[1]
+    }
+    return null;
+  }
+
+  $.getJSON(releases_url, function(data) {
+    var branch = get_branch();
+    if (show_edit(branch, data)) {
+      // patch the edit-on-github URL for correct branch
+      var url = $("#edit-on-github").attr("href");
+      url = url.replace("master", branch);
+      $("#edit-on-github").attr("href", url);
+      $("#docubetter").show();
+    }
+  });
+});
diff --git a/ceph/doc/_templates/page.html b/ceph/doc/_templates/page.html
new file mode 100644 (file)
index 0000000..914a752
--- /dev/null
@@ -0,0 +1,21 @@
+{% extends "!page.html" %}
+{% block body %}
+
+<div id="dev-warning" class="admonition note" style="display:none;">
+  <p class="first admonition-title">Notice</p>
+  <p class="last">This document is for a development version of Ceph.</p>
+</div>
+
+<div id="eol-warning" class="admonition warning" style="display:none;">
+  <p class="first admonition-title">Warning</p>
+  <p class="last">This document is for an unsupported version of Ceph.</p>
+</div>
+
+{%- if edit_on_github_url %}
+  <div id="docubetter" align="right" style="display:none; padding: 15px; font-weight: bold;">
+    <a id="edit-on-github" href="{{ edit_on_github_url }}" rel="nofollow">{{ _('Edit on GitHub')}}</a> | <a href="https://github.com/ceph/ceph/projects/4">Report a Documentation Bug</a>
+  </div>
+{%- endif %}
+
+  {{ super() }}
+{% endblock %}
index 2236ad4efbc44323bf9b918ff38c9da64c0f8177..367d7469308a1a12aa0a0e291e84b62be0cb83aa 100644 (file)
@@ -15,18 +15,51 @@ on the given lv or partition will be removed and all data will be purged.
 
 Zapping a logical volume::
 
-      ceph-volume lvm zap {vg name/lv name}
+    ceph-volume lvm zap {vg name/lv name}
 
 Zapping a partition::
 
-      ceph-volume lvm zap /dev/sdc1
+    ceph-volume lvm zap /dev/sdc1
 
-If you are zapping a raw device or partition and would like any vgs or lvs created
-from that device removed use the ``--destroy`` flag. A common use case is to simply
-deploy OSDs using a whole raw device. If you do so and then wish to reuse that device for
-another OSD you must use the ``--destroy`` flag when zapping so that the vgs and lvs that
-ceph-volume created on the raw device will be removed.
+Removing Devices
+----------------
+When zapping, and looking for full removal of the device (lv, vg, or partition)
+use the ``--destroy`` flag. A common use case is to simply deploy OSDs using
+a whole raw device. If you do so and then wish to reuse that device for another
+OSD you must use the ``--destroy`` flag when zapping so that the vgs and lvs
+that ceph-volume created on the raw device will be removed.
+
+.. note:: Multiple devices can be accepted at once; all of them will be zapped.
 
 Zapping a raw device and destroying any vgs or lvs present::
 
-      ceph-volume lvm zap /dev/sdc --destroy
+    ceph-volume lvm zap /dev/sdc --destroy
+
+
+This action can be performed on partitions, and logical volumes as well::
+
+    ceph-volume lvm zap /dev/sdc1 --destroy
+    ceph-volume lvm zap osd-vg/data-lv --destroy
+
+
+Finally, multiple devices can be detected if filtering by OSD ID and/or OSD
+FSID. Either identifier can be used or both can be used at the same time. This
+is useful in situations where multiple devices associated with a specific ID
+need to be purged. When using the FSID, the filtering is stricter, and might
+not match other (possibly invalid) devices associated to an ID.
+
+By ID only::
+
+    ceph-volume lvm zap --destroy --osd-id 1
+
+By FSID::
+
+    ceph-volume lvm zap --destroy --osd-fsid 2E8FBE58-0328-4E3B-BFB7-3CACE4E9A6CE
+
+By both::
+
+    ceph-volume lvm zap --destroy --osd-fsid 2E8FBE58-0328-4E3B-BFB7-3CACE4E9A6CE --osd-id 1
+
+
+.. warning:: If the systemd unit associated with the OSD ID to be zapped is
+             detected as running, the tool will refuse to zap until the daemon is stopped.
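
As a hedged illustration of the warning above, one would stop the OSD's systemd
unit before zapping (OSD ID 1 is only an example)::

    systemctl stop ceph-osd@1
    ceph-volume lvm zap --destroy --osd-id 1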
index 717553fea9afc2e3128b68e7cf0fb26eeacc46a4..24b05edfc26841ff0b2cd3d149fdbe706bccd4cd 100644 (file)
@@ -25,10 +25,9 @@ fragments may be *merged* to reduce the number of fragments in the directory.
 Splitting and merging
 =====================
 
-An MDS will only consider doing splits and merges if the ``mds_bal_frag``
-setting is true in the MDS's configuration file, and the allow_dirfrags
-setting is true in the filesystem map (set on the mons).  These settings
-are both true by default since the *Luminous* (12.2.x) release of Ceph.
+An MDS will only consider doing splits if the allow_dirfrags setting is true in
+the file system map (set on the mons).  This setting is true by default since
+the *Luminous* release (12.2.X).
 
 When an MDS identifies a directory fragment to be split, it does not
 do the split immediately.  Because splitting interrupts metadata IO,
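
As a hedged aside on the ``allow_dirfrags`` file system map setting referenced
above: it is toggled per file system with ``ceph fs set``; the exact syntax may
vary by release, and ``cephfs`` is only an example name::

    ceph fs set cephfs allow_dirfrags true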
index 8f0f20b8448c300ce8fa4fb7df7c79ef5f77e33c..e803da179c2ac976032071a696c5d5551f24b2ae 100644 (file)
@@ -23,9 +23,9 @@ Automatic client eviction
 
 There are three situations in which a client may be evicted automatically:
 
-On an active MDS daemon, if a client has not communicated with the MDS for
-over ``mds_session_autoclose`` seconds (300 seconds by default), then it
-will be evicted automatically.
+On an active MDS daemon, if a client has not communicated with the MDS for over
+``session_autoclose`` (a file system variable) seconds (300 seconds by
+default), then it will be evicted automatically.
 
 On an active MDS daemon, if a client has not responded to cap revoke messages
 for over ``mds_cap_revoke_eviction_timeout`` (configuration option) seconds.
index 02a4d485c1750c042238aeee9ecc480c40b38b33..251253703a07c3bbba4ca35efffa35c3e3b67f71 100644 (file)
@@ -26,7 +26,7 @@ For additional details on ``cephx`` configuration, see
 To mount the Ceph file system as a FUSE, you may use the ``ceph-fuse`` command.
 For example::
 
-       sudo mkdir /home/usernname/cephfs
+       sudo mkdir /home/username/cephfs
        sudo ceph-fuse -m 192.168.0.1:6789 /home/username/cephfs
 
 If you have more than one filesystem, specify which one to mount using
@@ -48,5 +48,5 @@ A persistent mount point can be setup via::
        sudo systemctl enable ceph-fuse@/mnt.service
 
 .. _ceph-fuse: ../../man/8/ceph-fuse/
-.. _fstab: ./fstab
+.. _fstab: ../fstab/#fuse
 .. _CEPHX Config Reference: ../../rados/configuration/auth-config-ref
index 7b82c2f87633ce0254d5b9bddb07b8e30bf970f7..3a6217c7b6907829d5ae5dd59dcad15de44fef58 100644 (file)
@@ -67,7 +67,7 @@ are like locks.  Sometimes, for example when another client needs access,
 the MDS will request clients release their capabilities.  If the client
 is unresponsive or buggy, it might fail to do so promptly or fail to do
 so at all.  This message appears if a client has taken longer than
-``mds_session_timeout`` (default 60s) to comply.
+``session_timeout`` (default 60s) to comply.
 
 Message: "Client *name* failing to respond to cache pressure"
 Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY
index 2fd47ae334f8ae153fc0e0ad8a505afa985b98d5..70a97c90f5b6923dcbe3fb70847b45925005e1b8 100644 (file)
 :Type: Boolean
 :Default: ``true`` 
 
-
-``mds max file size``
-
-:Description: The maximum allowed file size to set when creating a 
-              new file system.
-
-:Type:  64-bit Integer Unsigned
-:Default:  ``1ULL << 40``
-
 ``mds cache memory limit``
 
 :Description: The memory limit the MDS should enforce for its cache.
 :Default: ``24.0*60.0``
 
 
-``mds session timeout``
-
-:Description: The interval (in seconds) of client inactivity before Ceph 
-              times out capabilities and leases.
-              
-:Type:  Float
-:Default: ``60``
-
-
-``mds session autoclose``
-
-:Description: The interval (in seconds) before Ceph closes 
-              a laggy client's session.
-              
-:Type:  Float
-:Default: ``300``
-
-
 ``mds reconnect timeout``
 
 :Description: The interval (in seconds) to wait for clients to reconnect 
 :Default: ``0``
 
 
-``mds bal frag``
-
-:Description: Determines whether the MDS will fragment directories.
-:Type:  Boolean
-:Default:  ``false``
-
-
 ``mds bal split size``
 
 :Description: The maximum directory size before the MDS will split a directory 
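
The options removed in this hunk (``mds session timeout``, ``mds session
autoclose``, ``mds max file size``) are superseded by per-file-system settings,
as noted in the release notes earlier in this commit. A hedged sketch, with the
file system name and values purely illustrative::

    ceph fs set cephfs session_timeout 60
    ceph fs set cephfs session_autoclose 300
    ceph fs set cephfs max_file_size 1099511627776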
index ce1e5af9754f8a8cd0742726c55a11e15f4f915a..6bd56ba40075fbd0eae949f40c9da172266ac137 100644 (file)
@@ -33,16 +33,20 @@ html_logo = 'logo.png'
 html_favicon = 'favicon.ico'
 html_use_smartypants = True
 html_show_sphinx = False
+html_static_path = ["_static"]
 html_sidebars = {
     '**': ['smarttoc.html', 'searchbox.html'],
     }
 
+sys.path.insert(0, os.path.abspath('_ext'))
+
 extensions = [
     'sphinx.ext.autodoc',
     'sphinx.ext.graphviz',
     'sphinx.ext.todo',
     'sphinxcontrib.ditaa',
     'breathe',
+    'edit_on_github',
     ]
 ditaa = 'ditaa'
 todo_include_todos = True
@@ -66,3 +70,13 @@ breathe_domain_by_extension = {'py': 'py', 'c': 'c', 'h': 'c', 'cc': 'cxx', 'hpp
 pybind = os.path.join(top_level, 'src/pybind')
 if pybind not in sys.path:
     sys.path.insert(0, pybind)
+
+# the docs are rendered with github links pointing to master. the javascript
+# snippet in _static/ceph.js rewrites the edit links when a page is loaded, to
+# point to the correct branch.
+edit_on_github_project = 'ceph/ceph'
+edit_on_github_branch = 'master'
+
+# handles edit-on-github and old version warning display
+def setup(app):
+    app.add_javascript('js/ceph.js')
index af5775997fcb138e94777192fba074f8f91e3b9f..9ad5a5237af2dc92c17dfd2e5d7502104d2db672 100644 (file)
@@ -226,6 +226,17 @@ Usage, for logical partitions::
 
       ceph-volume lvm zap /dev/sdc1
 
+For full removal of the device use the ``--destroy`` flag (allowed for all
+device types)::
+
+      ceph-volume lvm zap --destroy /dev/sdc1
+
+Multiple devices can be removed by specifying the OSD ID and/or the OSD FSID::
+
+      ceph-volume lvm zap --destroy --osd-id 1
+      ceph-volume lvm zap --destroy --osd-id 1 --osd-fsid C9605912-8395-4D76-AFC0-7DFDAC315D59
+
+
 Positional arguments:
 
 * <DEVICE>  Either in the form of ``vg/lv`` for logical volumes,
index 897f62ec41daa5c81c681bb712f5215105b862fa..c5fae504a5f56945eec585d8270a4a4ce766797d 100644 (file)
@@ -258,6 +258,14 @@ creating a new Ceph cluster. They can be further edited with::
        # recompile
        crushtool -c map.txt -o crushmap
 
+Reclassify
+==========
+
+The *reclassify* function allows users to transition from older maps that
+maintain parallel hierarchies for OSDs of different types to a modern CRUSH
+map that makes use of the *device class* feature.  For more information,
+see http://docs.ceph.com/docs/master/rados/operations/crush-map-edits/#migrating-from-a-legacy-ssd-rule-to-device-classes.
+
 Example output from --test
 ==========================
 
index 191c45593be8a23d8287a307d0d4d693958a4d26..f3cb86f7ddd4a65901b7ffd0cab02ed77c988831 100644 (file)
@@ -129,6 +129,10 @@ The name is provided by the user and can be any useful identifying string.  The
 
   ceph balancer show <plan-name>
 
+All plans can be shown with::
+
+  ceph balancer ls
+
 Old plans can be discarded with::
 
   ceph balancer rm <plan-name>
index 542ba151a1cb46c317b30e10f4a0362874f4aaa9..d7e70ee92c6bed520edeb5c430af951f10001c3d 100644 (file)
@@ -51,8 +51,164 @@ To specify a WAL device and/or DB device, ::
 
   ceph-disk prepare --bluestore <device> --block.wal <wal-device> --block-db <db-device>
 
-Cache size
-==========
+Provisioning strategies
+-----------------------
+Although there are multiple ways to deploy a Bluestore OSD (unlike Filestore,
+which had only one), here are two common use cases that should help clarify
+the initial deployment strategy:
+
+.. _bluestore-single-type-device-config:
+
+**block (data) only**
+^^^^^^^^^^^^^^^^^^^^^
+If all the devices are the same type, for example all are spinning drives, and
+there are no fast devices to combine these, it makes sense to just deploy with
+block only and not try to separate ``block.db`` or ``block.wal``. The
+:ref:`ceph-volume-lvm` call for a single ``/dev/sda`` device would look like::
+
+    ceph-volume lvm create --bluestore --data /dev/sda
+
+If logical volumes have already been created for each device (1 LV using 100%
+of the device), then the :ref:`ceph-volume-lvm` call for an lv named
+``ceph-vg/block-lv`` would look like::
+
+    ceph-volume lvm create --bluestore --data ceph-vg/block-lv
+
+.. _bluestore-mixed-device-config:
+
+**block and block.db**
+^^^^^^^^^^^^^^^^^^^^^^
+If there is a mix of fast and slow devices (spinning and solid state),
+it is recommended to place ``block.db`` on the faster device while ``block``
+(data) lives on the slower (spinning drive). Sizing for ``block.db`` should be
+as large as possible to avoid performance penalties otherwise. The
+``ceph-volume`` tool is currently not able to create these automatically, so
+the volume groups and logical volumes need to be created manually.
+
+For the example below, let's assume 4 spinning drives (sda, sdb, sdc, and sdd)
+and 1 solid state drive (sdx). First create the volume groups::
+
+    $ vgcreate ceph-block-0 /dev/sda
+    $ vgcreate ceph-block-1 /dev/sdb
+    $ vgcreate ceph-block-2 /dev/sdc
+    $ vgcreate ceph-block-3 /dev/sdd
+
+Now create the logical volumes for ``block``::
+
+    $ lvcreate -l 100%FREE -n block-0 ceph-block-0
+    $ lvcreate -l 100%FREE -n block-1 ceph-block-1
+    $ lvcreate -l 100%FREE -n block-2 ceph-block-2
+    $ lvcreate -l 100%FREE -n block-3 ceph-block-3
+
+We are creating 4 OSDs for the four slow spinning devices, so assuming a 200GB
+SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB::
+
+    $ vgcreate ceph-db-0 /dev/sdx
+    $ lvcreate -L 50GB -n db-0 ceph-db-0
+    $ lvcreate -L 50GB -n db-1 ceph-db-0
+    $ lvcreate -L 50GB -n db-2 ceph-db-0
+    $ lvcreate -L 50GB -n db-3 ceph-db-0
+
+Finally, create the 4 OSDs with ``ceph-volume``::
+
+    $ ceph-volume lvm create --bluestore --data ceph-block-0/block-0 --block.db ceph-db-0/db-0
+    $ ceph-volume lvm create --bluestore --data ceph-block-1/block-1 --block.db ceph-db-0/db-1
+    $ ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
+    $ ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
+
+These operations should end up creating 4 OSDs, with ``block`` on the slower
+spinning drives and a 50GB logical volume for each coming from the solid state
+drive.
+
+Sizing
+======
+When using a :ref:`mixed spinning and solid drive setup
+<bluestore-mixed-device-config>` it is important to make a large-enough
+``block.db`` logical volume for Bluestore. Generally, the ``block.db``
+logical volume should be *as large as possible*.
+
+It is recommended that the ``block.db`` size isn't smaller than 4% of
+``block``. For example, if the ``block`` size is 1TB, then ``block.db``
+shouldn't be less than 40GB.
+
+If *not* using a mix of fast and slow devices, it isn't required to create
+separate logical volumes for ``block.db`` (or ``block.wal``). Bluestore will
+automatically manage these within the space of ``block``.
+
+
+Automatic Cache Sizing
+======================
+
+Bluestore can be configured to automatically resize its caches when tcmalloc
+is configured as the memory allocator and the ``bluestore_cache_autotune``
+setting is enabled.  This option is currently enabled by default.  Bluestore
+will attempt to keep OSD heap memory usage under a designated target size via
+the ``osd_memory_target`` configuration option.  This is a best effort
+algorithm and caches will not shrink smaller than the amount specified by
+``osd_memory_cache_min``.  Cache ratios will be chosen based on a hierarchy
+of priorities.  If priority information is not available, the
+``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio`` options are
+used as fallbacks.
+
+``bluestore_cache_autotune``
+
+:Description: Automatically tune the ratios assigned to different bluestore caches while respecting minimum values.
+:Type: Boolean
+:Required: Yes
+:Default: ``True``
+
+``osd_memory_target``
+
+:Description: When tcmalloc is available and cache autotuning is enabled, try to keep this many bytes mapped in memory. Note: This may not exactly match the RSS memory usage of the process.  While the total amount of heap memory mapped by the process should generally stay close to this target, there is no guarantee that the kernel will actually reclaim memory that has been unmapped.  During initial development, it was found that some kernels result in the OSD's RSS Memory exceeding the mapped memory by up to 20%.  It is hypothesised, however, that the kernel generally may be more aggressive about reclaiming unmapped memory when there is a high amount of memory pressure.  Your mileage may vary.
+:Type: Unsigned Integer
+:Required: Yes
+:Default: ``4294967296``
+
+``bluestore_cache_autotune_chunk_size``
+
+:Description: The chunk size in bytes to allocate to caches when cache autotune is enabled.  When the autotuner assigns memory to different caches, it will allocate memory in chunks.  This is done to avoid evictions when there are minor fluctuations in the heap size or autotuned cache ratios.
+:Type: Unsigned Integer
+:Required: No
+:Default: ``33554432``
+
+``bluestore_cache_autotune_interval``
+
+:Description: The number of seconds to wait between rebalances when cache autotune is enabled.  This setting changes how quickly the ratios of the different caches are recomputed.  Note:  Setting the interval too small can result in high CPU usage and lower performance.
+:Type: Float
+:Required: No
+:Default: ``5``
+
+``osd_memory_base``
+
+:Description: When tcmalloc and cache autotuning is enabled, estimate the minimum amount of memory in bytes the OSD will need.  This is used to help the autotuner estimate the expected aggregate memory consumption of the caches.
+:Type: Unsigned Integer
+:Required: No
+:Default: ``805306368``
+
+``osd_memory_expected_fragmentation``
+
+:Description: When tcmalloc and cache autotuning is enabled, estimate the percent of memory fragmentation.  This is used to help the autotuner estimate the expected aggregate memory consumption of the caches.
+:Type: Float
+:Required: No
+:Default: ``0.15``
+
+``osd_memory_cache_min``
+
+:Description: When tcmalloc and cache autotuning is enabled, set the minimum amount of memory used for caches. Note: Setting this value too low can result in significant cache thrashing.
+:Type: Unsigned Integer
+:Required: No
+:Default: ``134217728``
+
+``osd_memory_cache_resize_interval``
+
+:Description: When tcmalloc and cache autotuning is enabled, wait this many seconds between resizing caches.  This setting changes the total amount of memory available for bluestore to use for caching.  Note: Setting the interval too small can result in memory allocator thrashing and lower performance.
+:Type: Float
+:Required: No
+:Default: ``1``
+
+
+Manual Cache Sizing
+===================
 
 The amount of memory consumed by each OSD for BlueStore's cache is
 determined by the ``bluestore_cache_size`` configuration option.  If
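
The autotuning options documented in this hunk are ordinary OSD settings; a
minimal, hedged ``ceph.conf`` fragment could look like the following (the 2 GiB
target is an illustrative value, not a recommendation from this commit)::

    [osd]
    # keep the OSD's mapped heap near 2 GiB; caches are resized to fit
    osd_memory_target = 2147483648
    # never let the caches shrink below the 128 MiB default floor
    osd_memory_cache_min = 134217728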
index 0cdc4313cc2586c9c14e36151d06523db805ce3d..20cba1bcabe9df59e99d52db2bae3fd10bcd7812 100644 (file)
@@ -1,3 +1,5 @@
+.. _adding-and-removing-monitors:
+
 ==========================
  Adding/Removing Monitors
 ==========================
@@ -6,6 +8,8 @@ When you have a cluster up and running, you may add or remove monitors
 from the cluster at runtime. To bootstrap a monitor, see `Manual Deployment`_
 or `Monitor Bootstrap`_.
 
+.. _adding-monitors:
+
 Adding Monitors
 ===============
 
@@ -121,6 +125,7 @@ on ``mon.a``).
 
        ceph-mon -i {mon-id} --public-addr {ip:port}
 
+.. _removing-monitors:
 
 Removing Monitors
 =================
index 36a90208317bf2053eecdd1922854a885b6a2abf..64d37c7144869f83edfc363e570e0a0590d7e0ae 100644 (file)
@@ -475,144 +475,161 @@ A rule takes the following form::
 .. important:: A given CRUSH rule may be assigned to multiple pools, but it
    is not possible for a single pool to have multiple CRUSH rules.
 
-
-Placing Different Pools on Different OSDS:
-==========================================
-
-Suppose you want to have most pools default to OSDs backed by large hard drives,
-but have some pools mapped to OSDs backed by fast solid-state drives (SSDs).
-It's possible to have multiple independent CRUSH hierarchies within the same
-CRUSH map. Define two hierarchies with two different root nodes--one for hard
-disks (e.g., "root platter") and one for SSDs (e.g., "root ssd") as shown
-below::
-
-  device 0 osd.0
-  device 1 osd.1
-  device 2 osd.2
-  device 3 osd.3
-  device 4 osd.4
-  device 5 osd.5
-  device 6 osd.6
-  device 7 osd.7
-
-       host ceph-osd-ssd-server-1 {
-               id -1
-               alg straw
-               hash 0
-               item osd.0 weight 1.00
-               item osd.1 weight 1.00
-       }
-
-       host ceph-osd-ssd-server-2 {
-               id -2
-               alg straw
-               hash 0
-               item osd.2 weight 1.00
-               item osd.3 weight 1.00
-       }
-
-       host ceph-osd-platter-server-1 {
-               id -3
-               alg straw
-               hash 0
-               item osd.4 weight 1.00
-               item osd.5 weight 1.00
-       }
-
-       host ceph-osd-platter-server-2 {
-               id -4
-               alg straw
-               hash 0
-               item osd.6 weight 1.00
-               item osd.7 weight 1.00
-       }
-
-       root platter {
-               id -5
-               alg straw
-               hash 0
-               item ceph-osd-platter-server-1 weight 2.00
-               item ceph-osd-platter-server-2 weight 2.00
-       }
-
-       root ssd {
-               id -6
-               alg straw
-               hash 0
-               item ceph-osd-ssd-server-1 weight 2.00
-               item ceph-osd-ssd-server-2 weight 2.00
-       }
-
-       rule data {
-               ruleset 0
-               type replicated
-               min_size 2
-               max_size 2
-               step take platter
-               step chooseleaf firstn 0 type host
-               step emit
-       }
-
-       rule metadata {
-               ruleset 1
-               type replicated
-               min_size 0
-               max_size 10
-               step take platter
-               step chooseleaf firstn 0 type host
-               step emit
-       }
-
-       rule rbd {
-               ruleset 2
-               type replicated
-               min_size 0
-               max_size 10
-               step take platter
-               step chooseleaf firstn 0 type host
-               step emit
-       }
-
-       rule platter {
-               ruleset 3
-               type replicated
-               min_size 0
-               max_size 10
-               step take platter
-               step chooseleaf firstn 0 type host
-               step emit
-       }
-
-       rule ssd {
-               ruleset 4
-               type replicated
-               min_size 0
-               max_size 4
-               step take ssd
-               step chooseleaf firstn 0 type host
-               step emit
-       }
-
-       rule ssd-primary {
-               ruleset 5
-               type replicated
-               min_size 5
-               max_size 10
-               step take ssd
-               step chooseleaf firstn 1 type host
-               step emit
-               step take platter
-               step chooseleaf firstn -1 type host
-               step emit
-       }
-
-You can then set a pool to use the SSD rule by::
-
-  ceph osd pool set <poolname> crush_ruleset 4
-
-Similarly, using the ``ssd-primary`` rule will cause each placement group in the
-pool to be placed with an SSD as the primary and platters as the replicas.
-
+.. _crush-reclassify:
+
+Migrating from a legacy SSD rule to device classes
+--------------------------------------------------
+
+It used to be necessary to manually edit your CRUSH map and maintain a
+parallel hierarchy for each specialized device type (e.g., SSD) in order to
+write rules that apply to those devices.  Since the Luminous release,
+the *device class* feature has enabled this transparently.
+
+However, migrating from an existing, manually customized per-device map to
+the new device class rules in the trivial way will cause all data in the
+system to be reshuffled.
+
+The ``crushtool`` has a few commands that can transform a legacy rule
+and hierarchy so that you can start using the new class-based rules.
+There are three types of transformations possible:
+
+#. ``--reclassify-root <root-name> <device-class>``
+
+   This will take everything in the hierarchy beneath root-name and
+   adjust any rules that reference that root via a ``take
+   <root-name>`` to instead ``take <root-name> class <device-class>``.
+   It renumbers the buckets in such a way that the old IDs are instead
+   used for the specified class's "shadow tree" so that no data
+   movement takes place.
+
+   For example, imagine you have an existing rule like::
+
+     rule replicated_ruleset {
+        id 0
+        type replicated
+        min_size 1
+        max_size 10
+        step take default
+        step chooseleaf firstn 0 type rack
+        step emit
+     }
+
+   If you reclassify the root `default` as class `hdd`, the rule will
+   become::
+
+     rule replicated_ruleset {
+        id 0
+        type replicated
+        min_size 1
+        max_size 10
+        step take default class hdd
+        step chooseleaf firstn 0 type rack
+        step emit
+     }
+
+#. ``--set-subtree-class <bucket-name> <device-class>``
+
+   This will mark every device in the subtree rooted at *bucket-name*
+   with the specified device class.
+
+   This is normally used in conjunction with the ``--reclassify-root``
+   option to ensure that all devices in that root are labeled with the
+   correct class.  In some situations, however, some of those devices
+   (correctly) have a different class and we do not want to relabel
+   them.  In such cases, one can exclude the ``--set-subtree-class``
+   option.  This means that the remapping process will not be perfect,
+   since the previous rule distributed across devices of multiple
+   classes but the adjusted rules will only map to devices of the
+   specified *device-class*, but that often is an accepted level of
+   data movement when the number of outlier devices is small.
+
+#. ``--reclassify-bucket <match-pattern> <device-class> <default-parent>``
+
+   This will allow you to merge a parallel type-specific hierarchy with the normal hierarchy.  For example, many users have maps like::
+
+     host node1 {
+        id -2           # do not change unnecessarily
+        # weight 109.152
+        alg straw
+        hash 0  # rjenkins1
+        item osd.0 weight 9.096
+        item osd.1 weight 9.096
+        item osd.2 weight 9.096
+        item osd.3 weight 9.096
+        item osd.4 weight 9.096
+        item osd.5 weight 9.096
+        ...
+     }
+
+     host node1-ssd {
+        id -10          # do not change unnecessarily
+        # weight 2.000
+        alg straw
+        hash 0  # rjenkins1
+        item osd.80 weight 2.000
+       ...
+     }
+
+     root default {
+        id -1           # do not change unnecessarily
+        alg straw
+        hash 0  # rjenkins1
+        item node1 weight 110.967
+        ...
+     }
+
+     root ssd {
+        id -18          # do not change unnecessarily
+        # weight 16.000
+        alg straw
+        hash 0  # rjenkins1
+        item node1-ssd weight 2.000
+       ...
+     }
+
+   This function will reclassify each bucket that matches a
+   pattern.  The pattern can look like ``%suffix`` or ``prefix%``.
+   For example, in the above example, we would use the pattern
+   ``%-ssd``.  For each matched bucket, the remaining portion of the
+   name (that matches the ``%`` wildcard) specifies the *base bucket*.
+   All devices in the matched bucket are labeled with the specified
+   device class and then moved to the base bucket.  If the base bucket
+   does not exist (e.g., ``node12-ssd`` exists but ``node12`` does
+   not), then it is created and linked underneath the specified
+   *default parent* bucket.  In each case, we are careful to preserve
+   the old bucket IDs for the new shadow buckets to prevent data
+   movement.  Any rules with ``take`` steps referencing the old
+   buckets are adjusted.
+
+#. ``--reclassify-bucket <bucket-name> <device-class> <base-bucket>``
+
+   The same command can also be used without a wildcard to map a
+   single bucket.  For example, in the previous example, we want the
+   ``ssd`` bucket to be mapped to the ``default`` bucket.
+
+The final command to convert the map comprised of the above fragments would be something like::
+
+  $ ceph osd getcrushmap -o original
+  $ crushtool -i original --reclassify \
+      --set-subtree-class default hdd \
+      --reclassify-root default hdd \
+      --reclassify-bucket %-ssd ssd default \
+      --reclassify-bucket ssd ssd default \
+      -o adjusted
+
+In order to ensure that the conversion is correct, there is a ``--compare`` command that will test a large sample of inputs to the CRUSH map and ensure that the same result comes back out.  These inputs are controlled by the same options that apply to the ``--test`` command.  For the above example::
+
+  $ crushtool -i original --compare adjusted
+  rule 0 had 0/10240 mismatched mappings (0)
+  rule 1 had 0/10240 mismatched mappings (0)
+  maps appear equivalent
+
+If there were differences, you would see the ratio of remapped inputs
+in parentheses.
+
+If you are satisfied with the adjusted map, you can apply it to the cluster with something like::
+
+  ceph osd setcrushmap -i adjusted
 
 Tuning CRUSH, the hard way
 --------------------------
index 05fa4ff691aef6a21590b1e796f3c7286224c31b..e9d6673440bcd9a0102643ac4a794794ae636848 100644 (file)
@@ -243,6 +243,11 @@ with::
 
   ceph osd crush tree --show-shadow
 
+For older clusters created before Luminous that relied on manually
+crafted CRUSH maps to maintain per-device-type hierarchies, there is a
+*reclassify* tool available to help transition to device classes
+without triggering data movement (see :ref:`crush-reclassify`).
+
 
 Weights sets
 ------------
index 8a35a501ab110dfc9ddbac9c566801bd109da869..8c08741073cba1ae38c7298d3c885e8c4bad4906 100644 (file)
@@ -387,12 +387,6 @@ For example::
        ceph auth caps client.paul mon 'allow rw' osd 'allow rwx pool=liverpool'
        ceph auth caps client.brian-manager mon 'allow *' osd 'allow *'
 
-To remove a capability, you may reset the capability. If you want the user
-to have no access to a particular daemon that was previously set, specify 
-an empty string. For example:: 
-
-       ceph auth caps client.ringo mon ' ' osd ' '
-
 See `Authorization (Capabilities)`_ for additional details on capabilities.
 
 
index 89fb94c32f998bd57bb4a30c38ee565a88c374e8..642b2e07bfd18f0584e099a5d8cc4d86a5b9d9cd 100644 (file)
@@ -402,8 +402,8 @@ or::
 Recovery using healthy monitor(s)
 ---------------------------------
 
-If there is any survivers, we can always `replace`_ the corrupted one with a
-new one. And after booting up, the new joiner will sync up with a healthy
+If there are any survivors, we can always :ref:`replace <adding-and-removing-monitors>` the corrupted one with a
+new one. After booting up, the new joiner will sync up with a healthy
 peer, and once it is fully sync'ed, it will be able to serve the clients.
 
 Recovery using OSDs
@@ -563,5 +563,4 @@ Finally, you should reach out to us on the mailing lists, on IRC or file
 a new issue on the `tracker`_.
 
 .. _cluster map: ../../architecture#cluster-map
-.. _replace: ../operation/add-or-rm-mons
 .. _tracker: http://tracker.ceph.com/projects/ceph/issues/new
index 5da13a8b944a883a128f92b61344863d3921a2c0..16efd5f848375ee016c0f5858bca0804d46eb867 100644 (file)
@@ -1858,8 +1858,9 @@ Valid parameters for quotas include:
   the maximum number of objects. A negative value disables this setting.
   
 - **Maximum Size:** The ``max-size`` option allows you to specify a quota
-  for the maximum number of bytes. A negative value disables this setting.
-  
+  for the maximum number of bytes. The ``max-size-kb`` option allows you
+  to specify it in KiB. A negative value disables this setting.
+
 - **Quota Type:** The ``quota-type`` option sets the scope for the quota.
   The options are ``bucket`` and ``user``.
 
index 45054a9eceb27ad4be56794352e3c4a38b8ad0d3..d86baf12637f8f2227e60028fd4aded88f056b13 100644 (file)
@@ -576,6 +576,17 @@ Swift Settings
 :Default: ``false``
 
 
+``rgw trust forwarded https``
+
+:Description: When a proxy in front of radosgw is used for SSL termination, radosgw
+              does not know whether incoming HTTP connections are secure. Enable
+              this option to trust the ``Forwarded`` and ``X-Forwarded-Proto`` headers
+              sent by the proxy when determining whether the connection is secure.
+              This is required for some features, such as server-side encryption.
+:Type: Boolean
+:Default: ``false``
+
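+For example, when radosgw sits behind an SSL-terminating proxy, this
+might be enabled in ``ceph.conf`` with something like (the section name
+is only an example)::
+
+  [client.rgw.gateway]
+  rgw trust forwarded https = true
+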
+
 
 Logging Settings
 ================
index a7bb7e2e931b65130c9cd2f4e00c954d75b2a6d1..ea89e502ab0b8e828d34fdcfc32437d3acc56d65 100644 (file)
@@ -9,6 +9,11 @@ with 3 options for the management of encryption keys. Server-side encryption
 means that the data is sent over HTTP in its unencrypted form, and the Ceph
 Object Gateway stores that data in the Ceph Storage Cluster in encrypted form.
 
+.. note:: Requests for server-side encryption must be sent over a secure HTTPS
+          connection to avoid sending secrets in plaintext. If a proxy is used
+          for SSL termination, ``rgw trust forwarded https`` must be enabled
+          before forwarded requests will be trusted as secure.
+
 Customer-Provided Keys
 ======================
 
index ff6323ee454b345439cae2f9856af67e6209eeeb..7c0b2cced8309c159aad97c2570884cf1af5e0dd 100644 (file)
@@ -18,7 +18,7 @@ and the Boost.Asio library for asynchronous network i/o.
 Options
 -------
 
-``port``
+``port`` and ``ssl_port``
 
 :Description: Sets the listening port number. Can be specified multiple
               times as in ``port=80 port=8000``.
@@ -27,18 +27,37 @@ Options
 :Default: ``80``
 
 
-``endpoint``
+``endpoint`` and ``ssl_endpoint``
 
 :Description: Sets the listening address in the form ``address[:port]``,
               where the address is an IPv4 address string in dotted decimal
-              form, or an IPv6 address in hexadecimal notation. The
-              optional port defaults to 80. Can be specified multiple times
-              as in ``endpoint=::1 endpoint=192.168.0.100:8000``.
+              form, or an IPv6 address in hexadecimal notation surrounded
+              by square brackets. The optional port defaults to 80 for
+              ``endpoint`` and 443 for ``ssl_endpoint``. Can be specified
+              multiple times as in ``endpoint=[::1] endpoint=192.168.0.100:8000``.
 
 :Type: Integer
 :Default: None
 
 
+``ssl_certificate``
+
+:Description: Path to the SSL certificate file used for SSL-enabled endpoints.
+
+:Type: String
+:Default: None
+
+
+``ssl_private_key``
+
+:Description: Optional path to the private key file used for SSL-enabled
+              endpoints. If one is not given, the ``ssl_certificate`` file
+              is used as the private key.
+
+:Type: String
+:Default: None
+
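+For example, an SSL-enabled Beast frontend might be configured in
+``ceph.conf`` with something like (the section name and certificate
+path are only examples)::
+
+  [client.rgw.gateway]
+  rgw frontends = beast ssl_port=443 ssl_certificate=/etc/ssl/private/rgw.pem
+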
+
 Civetweb
 ========
 
index eac5dc8c9f0be8444e25ac25808f5797e3eea56d..2ad982e39e6823065c7dbe7a57440b47d6158aa6 100644 (file)
@@ -39,11 +39,29 @@ separate hosts.
 RAM
 ===
 
-Metadata servers and monitors must be capable of serving their data quickly, so
-they should have plenty of RAM (e.g., 1GB of RAM per daemon instance). OSDs do
-not require as much RAM for regular operations (e.g., 500MB of RAM per daemon
-instance); however, during recovery they need significantly more RAM (e.g., ~1GB
-per 1TB of storage per daemon). Generally, more RAM is better.
+Generally, more RAM is better.
+
+Monitors and managers (ceph-mon and ceph-mgr)
+---------------------------------------------
+
+Monitor and manager daemon memory usage generally scales with the size of the
+cluster.  For small clusters, 1-2 GB is generally sufficient.  For
+large clusters, you should provide more (5-10 GB).  You may also want
+to consider tuning settings like ``mon_osd_cache_size`` or
+``rocksdb_cache_size``.
+
+Metadata servers (ceph-mds)
+---------------------------
+
+The metadata daemon memory utilization depends on how much memory its cache is
+configured to consume.  We recommend 1 GB as a minimum for most systems.  See
+``mds_cache_memory_limit``.
+
+OSDs (ceph-osd)
+---------------
+
+By default, OSDs that use the BlueStore backend require 3-5 GB of RAM.  You can
+adjust the amount of memory the OSD consumes with the
+``osd_memory_target`` configuration option.  When using the legacy
+FileStore backend, the operating system page cache is used for caching
+data, so no tuning is normally needed; there, OSD memory consumption is
+generally related to the number of PGs per daemon in the system.
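+
+As a rough illustration, the OSD and MDS memory limits mentioned above
+might be set in ``ceph.conf`` with something like (the values are only
+examples and should be sized for your hardware)::
+
+  [osd]
+  # roughly 4 GB per BlueStore OSD
+  osd memory target = 4294967296
+
+  [mds]
+  # roughly 1 GB for the MDS cache
+  mds cache memory limit = 1073741824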
 
 
 Data Storage
index 50b7f307f6ef2828e9cca7474913ed1bfeb227c2..dcb01e7a05191a8186ec15c69ee7a73e7ad80162 100644 (file)
@@ -124,7 +124,7 @@ configuration details, perform the following steps using ``ceph-deploy``.
      ceph-deploy mgr create node1  *Required only for luminous+ builds, i.e >= 12.x builds*
 
 #. Add three OSDs. For the purposes of these instructions, we assume you have an
-   unused disk in each node called ``/dev/vdb``.  *Be sure that the device is not currently in use and does not contain any important data.*
+   unused disk in each node called ``/dev/vdb``.  *Be sure that the device is not currently in use and does not contain any important data.* ::
 
      ceph-deploy osd create {ceph-node}:{device}
 
index 2b6109c4c6deab367b2f493e9dc9434e1d018348..e51c045a65d5d2a728b27b276bc06bca45e7fd2b 100644 (file)
@@ -3,13 +3,13 @@ CXX?=g++
 CXX_FLAGS?=-std=c++11 -Wall -Wextra -Werror -g
 CXX_LIBS?=-lrados -lradosstriper
 CXX_INC?=$(LOCAL_LIBRADOS_INC)
-CXX_CC=$(CXX) $(CXX_FLAGS) $(CXX_INC) $(LOCAL_LIBRADOS) $(CXX_LIBS)
+CXX_CC=$(CXX) $(CXX_FLAGS) $(CXX_INC) $(LOCAL_LIBRADOS)
 
 CC?=gcc
 CC_FLAGS=-Wall -Wextra -Werror -g
 CC_INC=$(LOCAL_LIBRADOS_INC)
 CC_LIBS?=-lrados
-CC_CC=$(CC) $(CC_FLAGS) $(CC_INC) $(LOCAL_LIBRADOS) $(CC_LIBS)
+CC_CC=$(CC) $(CC_FLAGS) $(CC_INC) $(LOCAL_LIBRADOS)
 
 # Relative path to the Ceph source:
 CEPH_SRC_HOME?=../../src
@@ -26,13 +26,13 @@ all-system: LOCAL_LIBRADOS_INC=
 all-system: all
 
 hello_world_cpp: hello_world.cc
-       $(CXX_CC) -o hello_world_cpp hello_world.cc
+       $(CXX_CC) -o hello_world_cpp hello_world.cc $(CXX_LIBS)
 
 hello_radosstriper_cpp: hello_radosstriper.cc
-       $(CXX_CC) -o hello_radosstriper_cpp hello_radosstriper.cc
+       $(CXX_CC) -o hello_radosstriper_cpp hello_radosstriper.cc $(CXX_LIBS)
 
 hello_world_c: hello_world_c.c
-       $(CC_CC) -o hello_world_c hello_world_c.c
+       $(CC_CC) -o hello_world_c hello_world_c.c $(CC_LIBS)
 
 clean:
        rm -f hello_world_cpp hello_radosstriper_cpp hello_world_c
index d438f932e47f3d9692572e6546bbb511f4fdb67e..afa1cb32e8fa4321b0ffdbc9b9a85af2752cdbad 100644 (file)
@@ -6,7 +6,7 @@ build tree (ie. using relative paths). If you would like to build the examples a
 your system librados and headers, use "make all-system".
 
 And executed using
-./librados_hello_world -c ../../src/ceph.conf
+./hello_world_cpp -c ../../src/ceph.conf
 (or whatever path to a ceph.conf is appropriate to you, or
 by explicitly specifying monitors, user id, and keys).
 
index 9ead1056d97dfb5f7671e86476f8353d62f78b4d..e73e05f6b534216117d1970cbee76bf5a5102d40 100755 (executable)
@@ -90,6 +90,7 @@ if [ x`uname`x = xFreeBSDx ]; then
         net/socat \
         textproc/expat2 \
         textproc/gsed \
+        lang/gawk \
         textproc/libxml2 \
         textproc/xmlstarlet \
         textproc/jq \
diff --git a/ceph/qa/cephfs/clusters/1-mds-1-client-coloc.yaml b/ceph/qa/cephfs/clusters/1-mds-1-client-coloc.yaml
new file mode 100644 (file)
index 0000000..abcfffe
--- /dev/null
@@ -0,0 +1,12 @@
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+openstack:
+- volumes: # attached to each instance
+    count: 4
+    size: 20 # GB
+- machine:
+    disk: 200 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
index e64b0b88d26a10a2c510a699d2cf1db227e46214..966f0dcc8145910dc5e01e0624063a35f6f286af 100644 (file)
@@ -5,4 +5,9 @@ roles:
 openstack:
 - volumes: # attached to each instance
     count: 4
-    size: 10 # GB
+    size: 20 # GB
+- machine:
+    disk: 200 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
diff --git a/ceph/qa/cephfs/clusters/1-mds-2-client-coloc.yaml b/ceph/qa/cephfs/clusters/1-mds-2-client-coloc.yaml
new file mode 100644 (file)
index 0000000..9f0f0dc
--- /dev/null
@@ -0,0 +1,12 @@
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7, client.1]
+openstack:
+- volumes: # attached to each instance
+    count: 4
+    size: 20 # GB
+- machine:
+    disk: 200 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
index 006e15a7b23d41435b25bcccaeb88b11adde6a28..656178c0f842cbd733f03b3000bd3a9dce9410ea 100644 (file)
@@ -6,4 +6,9 @@ roles:
 openstack:
 - volumes: # attached to each instance
     count: 4
-    size: 10 # GB
+    size: 30 # GB
+- machine:
+    disk: 200 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
diff --git a/ceph/qa/cephfs/clusters/1-mds-3-client.yaml b/ceph/qa/cephfs/clusters/1-mds-3-client.yaml
new file mode 100644 (file)
index 0000000..02e6d6d
--- /dev/null
@@ -0,0 +1,15 @@
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+- [client.0]
+- [client.1]
+- [client.2]
+openstack:
+- volumes: # attached to each instance
+    count: 4
+    size: 30 # GB
+- machine:
+    disk: 200 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
diff --git a/ceph/qa/cephfs/clusters/1-mds-4-client-coloc.yaml b/ceph/qa/cephfs/clusters/1-mds-4-client-coloc.yaml
new file mode 100644 (file)
index 0000000..6ff916c
--- /dev/null
@@ -0,0 +1,12 @@
+roles:
+- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0, client.1]
+- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7, client.2, client.3]
+openstack:
+- volumes: # attached to each instance
+    count: 4
+    size: 30 # GB
+- machine:
+    disk: 200 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
index a6be36deaf8da26a0bcc593780bb0046cfa5460d..f17c83b823a243dcbfe9abbff92a3789830056d3 100644 (file)
@@ -8,4 +8,9 @@ roles:
 openstack:
 - volumes: # attached to each instance
     count: 4
-    size: 10 # GB
+    size: 30 # GB
+- machine:
+    disk: 200 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
index c0d463a90d6120654dc6710ed03861d082e2c9d6..f9fc10808657d505d341f5813899699e7421ca7a 100644 (file)
@@ -5,4 +5,9 @@ roles:
 openstack:
 - volumes: # attached to each instance
     count: 4
-    size: 10 # GB
+    size: 30 # GB
+- machine:
+    disk: 200 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
index 0bf240272bc9942c087c2281480e8a433c91b316..414fb2ba44dc50a5d6560adfdb89a4082de9677f 100644 (file)
@@ -5,4 +5,9 @@ roles:
 openstack:
 - volumes: # attached to each instance
     count: 4
-    size: 10 # GB
+    size: 30 # GB
+- machine:
+    disk: 200 # GB
+log-rotate:
+  ceph-mds: 10G
+  ceph-osd: 10G
index 94948f4c39d0c2ef8eafa899e909a056efe79c0a..129aac6cea2dca37f592a724041feb7bdc1d063a 100644 (file)
@@ -4,7 +4,9 @@ roles:
 openstack:
 - volumes: # attached to each instance
     count: 4
-    size: 10 # GB
+    size: 30 # GB
+- machine:
+    disk: 200 # GB
 log-rotate:
   ceph-mds: 10G
   ceph-osd: 10G
index 9321cba6515016696afaf22a4d60583ff09ab459..2c7ceaa3414e1c605d7c7fa225908517d3d58fa3 100755 (executable)
@@ -6,7 +6,8 @@ if [ ! -e Makefile -o ! -d bin ]; then
     exit 1
 fi
 
-if [ ! -d /tmp/ceph-disk-virtualenv -o ! -d /tmp/ceph-detect-init-virtualenv ]; then
+TEMP_DIR=${TMPDIR:-/tmp}
+if [ ! -d $TEMP_DIR/ceph-disk-virtualenv -o ! -d $TEMP_DIR/ceph-detect-init-virtualenv ]; then
     echo '/tmp/*-virtualenv directories not built. Please run "make check" first.'
     exit 1
 fi
index f12f0698a7624255d027d840d0e3662aec15d4f0..3883a6f58d48bec49e9303c15561fb36b6c4cff0 100755 (executable)
@@ -19,7 +19,9 @@
 #
 TIMEOUT=300
 PG_NUM=4
-: ${CEPH_BUILD_VIRTUALENV:=/tmp}
+TMPDIR=${TMPDIR:-/tmp}
+CEPH_BUILD_VIRTUALENV=${TMPDIR}
+TESTDIR=${TESTDIR:-${TMPDIR}}
 
 if type xmlstarlet > /dev/null 2>&1; then
     XMLSTARLET=xmlstarlet
@@ -32,10 +34,12 @@ fi
 
 if [ `uname` = FreeBSD ]; then
     SED=gsed
+    AWK=gawk
     DIFFCOLOPTS=""
     KERNCORE="kern.corefile"
 else
     SED=sed
+    AWK=awk
     termwidth=$(stty -a | head -1 | sed -e 's/.*columns \([0-9]*\).*/\1/')
     if [ -n "$termwidth" -a "$termwidth" != "0" ]; then
         termwidth="-W ${termwidth}"
@@ -202,8 +206,8 @@ function teardown() {
 
 function __teardown_btrfs() {
     local btrfs_base_dir=$1
-    local btrfs_root=$(df -P . | tail -1 | awk '{print $NF}')
-    local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list . -t | awk '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir")
+    local btrfs_root=$(df -P . | tail -1 | $AWK '{print $NF}')
+    local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list -t . | $AWK '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir")
     for subvolume in $btrfs_dirs; do
        sudo btrfs subvolume delete $btrfs_root/$subvolume
     done
@@ -1350,7 +1354,7 @@ function test_is_clean() {
 
 #######################################################################
 
-calc() { awk "BEGIN{print $*}"; }
+calc() { $AWK "BEGIN{print $*}"; }
 
 ##
 # Return a list of numbers that are increasingly larger and whose
@@ -1757,7 +1761,7 @@ function run_in_background() {
     local pid_variable=$1
     shift
     # Execute the command and prepend the output with its pid
-    # We enforce to return the exit status of the command and not the awk one.
+    # Make sure we return the exit status of the command and not that of sed.
     ("$@" |& sed 's/^/'$$': /'; return "${PIPESTATUS[0]}") >&2 &
     eval "$pid_variable+=\" $!\""
 }
index a266aed901fe5e630dba018c7cba790b93a5c5de..b6d541bb3b95b9bc5d191aa5483b7493618c64e0 100755 (executable)
@@ -5565,6 +5565,67 @@ EOF
     teardown $dir || return 1
 }
 
+function TEST_request_scrub_priority() {
+    local dir=$1
+    local poolname=psr_pool
+    local objname=POBJ
+    local OBJECTS=64
+    local PGS=8
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_mgr $dir x || return 1
+    local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+    ceph_osd_args+="--osd_scrub_backoff_ratio=0"
+    run_osd $dir 0 $ceph_osd_args || return 1
+
+    create_pool $poolname $PGS $PGS || return 1
+    wait_for_clean || return 1
+
+    local osd=0
+    add_something $dir $poolname $objname noscrub || return 1
+    local primary=$(get_primary $poolname $objname)
+    local pg=$(get_pg $poolname $objname)
+    poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')
+
+    local otherpgs
+    for i in $(seq 0 $(expr $PGS - 1))
+    do
+        opg="${poolid}.${i}"
+        if [ "$opg" = "$pg" ]; then
+          continue
+        fi
+        otherpgs="${otherpgs}${opg} "
+        local other_last_scrub=$(get_last_scrub_stamp $pg)
+        # Fake a scheduled scrub
+        CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \
+             trigger_scrub $opg || return 1
+    done
+
+    sleep 15
+    flush_pg_stats
+
+    # Request a regular scrub and it will be done
+    local last_scrub=$(get_last_scrub_stamp $pg)
+    ceph pg scrub $pg
+
+    ceph osd unset noscrub || return 1
+    ceph osd unset nodeep-scrub || return 1
+
+    wait_for_scrub $pg "$last_scrub"
+
+    for opg in $otherpgs $pg
+    do
+        wait_for_scrub $opg "$other_last_scrub"
+    done
+
+    # Verify that the requested scrub ran first
+    grep "log_channel.*scrub ok" $dir/osd.${primary}.log | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1
+
+    return 0
+}
+
+
 main osd-scrub-repair "$@"
 
 # Local Variables:
index 5ca4bd6093f314e6d276bf35da05567962a10d43..750fa6dd70cea499d844d6df2a1c052cd7840082 100644 (file)
@@ -3,8 +3,8 @@ meta:
 
 overrides:
    ceph_ansible:
+     branch: stable-3.2
      vars:
-        branch: stable-3.2
         ceph_conf_overrides:
           global:
             osd default pool size: 2
index a1e2ada19e995b5916b8f63edf31a05ba55862c5..37e315f7e3a6377738c72ed3a68ef36f63695802 100644 (file)
@@ -6,6 +6,7 @@ overrides:
         fuse default permissions: false
 tasks:
 - workunit:
+    timeout: 6h
     clients:
       all:
         - suites/pjd.sh
diff --git a/ceph/qa/suites/fs/basic_functional/clusters/1-mds-4-client-coloc.yaml b/ceph/qa/suites/fs/basic_functional/clusters/1-mds-4-client-coloc.yaml
new file mode 120000 (symlink)
index 0000000..e5444ae
--- /dev/null
@@ -0,0 +1 @@
+.qa/cephfs/clusters/1-mds-4-client-coloc.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml b/ceph/qa/suites/fs/basic_functional/clusters/4-remote-clients.yaml
deleted file mode 100644 (file)
index 1c540a4..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-roles:
-- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mds.a, mds.b, client.1, client.2, client.3]
-- [client.0, osd.4, osd.5, osd.6, osd.7]
-openstack:
-- volumes: # attached to each instance
-    count: 2
-    size: 10 # GB
-log-rotate:
-  ceph-mds: 10G
-  ceph-osd: 10G
index 3f4aac9e5dfb2b1a3a9657bf6b21441ae3e98450..9ae738f01f01d87bdff05ce55c96ff7a734b7437 100644 (file)
@@ -17,6 +17,8 @@ overrides:
       - Corrupt dentry
       - Scrub error on inode
       - Metadata damage detected
+      - MDS_READ_ONLY
+      - force file system read-only
 
 tasks:
   - cephfs_test_runner:
index a1e2ada19e995b5916b8f63edf31a05ba55862c5..37e315f7e3a6377738c72ed3a68ef36f63695802 100644 (file)
@@ -6,6 +6,7 @@ overrides:
         fuse default permissions: false
 tasks:
 - workunit:
+    timeout: 6h
     clients:
       all:
         - suites/pjd.sh
index 12047bd7ac667ea0c435ce98f0605a65f1b25705..5cd97a3ae51f255fc2b20d48ccd2366581f1baf5 100644 (file)
@@ -4,6 +4,8 @@ openstack:
 - volumes: # attached to each instance
     count: 2
     size: 10 # GB
+- machine:
+    disk: 100 # GB
 log-rotate:
   ceph-mds: 10G
   ceph-osd: 10G
diff --git a/ceph/qa/suites/fs/multiclient/clusters/1-mds-2-client.yaml b/ceph/qa/suites/fs/multiclient/clusters/1-mds-2-client.yaml
new file mode 120000 (symlink)
index 0000000..9f4f161
--- /dev/null
@@ -0,0 +1 @@
+.qa/cephfs/clusters/1-mds-2-client.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/multiclient/clusters/1-mds-3-client.yaml b/ceph/qa/suites/fs/multiclient/clusters/1-mds-3-client.yaml
new file mode 120000 (symlink)
index 0000000..6b25e07
--- /dev/null
@@ -0,0 +1 @@
+.qa/cephfs/clusters/1-mds-3-client.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml b/ceph/qa/suites/fs/multiclient/clusters/three_clients.yaml
deleted file mode 100644 (file)
index a533af5..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-roles:
-- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3]
-- [client.2]
-- [client.1]
-- [client.0]
-
-openstack:
-- volumes: # attached to each instance
-    count: 1
-    size: 10 # GB
-
-log-rotate:
-  ceph-mds: 10G
-  ceph-osd: 10G
-
diff --git a/ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml b/ceph/qa/suites/fs/multiclient/clusters/two_clients.yaml
deleted file mode 100644 (file)
index 00f3815..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-roles:
-- [mon.a, mon.b, mon.c, mgr.x, mds.a, osd.0, osd.1, osd.2, osd.3]
-- [client.1]
-- [client.0]
-
-openstack:
-- volumes: # attached to each instance
-    count: 3
-    size: 10 # GB
-
-log-rotate:
-  ceph-mds: 10G
-  ceph-osd: 10G
-
diff --git a/ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml b/ceph/qa/suites/fs/multifs/clusters/2-remote-clients.yaml
deleted file mode 100644 (file)
index 2ae772c..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-roles:
-- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, mon.b, mds.a, mds.b, client.1]
-- [mds.c, mds.d, mon.c, client.0, osd.4, osd.5, osd.6, osd.7]
-openstack:
-- volumes: # attached to each instance
-    count: 2
-    size: 10 # GB
-log-rotate:
-  ceph-mds: 10G
-  ceph-osd: 10G
index 2dd8ac77955ad52d83add8b1607aab2aa1816f13..09be2667548f26b4ca657016192f0128746e6270 100644 (file)
@@ -7,6 +7,7 @@ overrides:
         client acl type: posix_acl
 tasks:
 - workunit:
+    timeout: 6h
     clients:
       all:
         - suites/pjd.sh
diff --git a/ceph/qa/suites/fs/thrash/clusters/1-mds-1-client-coloc.yaml b/ceph/qa/suites/fs/thrash/clusters/1-mds-1-client-coloc.yaml
new file mode 120000 (symlink)
index 0000000..d15ecfd
--- /dev/null
@@ -0,0 +1 @@
+.qa/cephfs/clusters/1-mds-1-client-coloc.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/fs/thrash/clusters/mds-1active-1standby.yaml b/ceph/qa/suites/fs/thrash/clusters/mds-1active-1standby.yaml
deleted file mode 100644 (file)
index d025248..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-roles:
-- [mon.a, mon.c, osd.0, osd.1, osd.2, mds.b-s-a]
-- [mon.b, mgr.x, mds.a, osd.3, osd.4, osd.5, client.0]
-openstack:
-- volumes: # attached to each instance
-    count: 3
-    size: 10 # GB
-log-rotate:
-  ceph-mds: 10G
-  ceph-osd: 10G
index adcebc0baac47157f8d3201b920177061f67e221..4dc0086e6748c8aeb94957222d49db67fc56e2c3 100644 (file)
@@ -3,6 +3,6 @@ overrides:
     conf:
       global:
         ms inject socket failures: 2500
-        mds inject delay type: osd mds
+        ms inject delay type: osd mds
         ms inject delay probability: .005
         ms inject delay max: 1
index a1e2ada19e995b5916b8f63edf31a05ba55862c5..37e315f7e3a6377738c72ed3a68ef36f63695802 100644 (file)
@@ -6,6 +6,7 @@ overrides:
         fuse default permissions: false
 tasks:
 - workunit:
+    timeout: 6h
     clients:
       all:
         - suites/pjd.sh
index 09abaeb6eecd18ad74ab9992f67ca84816537426..1f24a5506f2978652f939edbbe76dd6308b8b357 100644 (file)
@@ -1,6 +1,7 @@
 tasks:
 - kclient:
 - workunit:
+    timeout: 6h
     clients:
       all:
         - suites/pjd.sh
index 3f4aac9e5dfb2b1a3a9657bf6b21441ae3e98450..9ae738f01f01d87bdff05ce55c96ff7a734b7437 100644 (file)
@@ -17,6 +17,8 @@ overrides:
       - Corrupt dentry
       - Scrub error on inode
       - Metadata damage detected
+      - MDS_READ_ONLY
+      - force file system read-only
 
 tasks:
   - cephfs_test_runner:
index a1e2ada19e995b5916b8f63edf31a05ba55862c5..37e315f7e3a6377738c72ed3a68ef36f63695802 100644 (file)
@@ -6,6 +6,7 @@ overrides:
         fuse default permissions: false
 tasks:
 - workunit:
+    timeout: 6h
     clients:
       all:
         - suites/pjd.sh
diff --git a/ceph/qa/suites/rados/singleton/all/mon-config-key-caps.yaml b/ceph/qa/suites/rados/singleton/all/mon-config-key-caps.yaml
new file mode 100644 (file)
index 0000000..0b0b95c
--- /dev/null
@@ -0,0 +1,17 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - client.0
+tasks:
+- install:
+- ceph:
+    log-whitelist:
+    - overall HEALTH_
+    - \(AUTH_BAD_CAPS\)
+- workunit:
+    clients:
+      all:
+        - mon/test_config_key_caps.sh
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/normal_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/normal_pg_log.yaml
new file mode 100644 (file)
index 0000000..8b13789
--- /dev/null
@@ -0,0 +1 @@
+
diff --git a/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/short_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/parallel/1.1-pg-log-overrides/short_pg_log.yaml
new file mode 100644 (file)
index 0000000..20cc101
--- /dev/null
@@ -0,0 +1,6 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        osd_min_pg_log_entries: 1
+        osd_max_pg_log_entries: 2
diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/normal_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/normal_pg_log.yaml
new file mode 100644 (file)
index 0000000..8b13789
--- /dev/null
@@ -0,0 +1 @@
+
diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/short_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.1-pg-log-overrides/short_pg_log.yaml
new file mode 100644 (file)
index 0000000..20cc101
--- /dev/null
@@ -0,0 +1,6 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        osd_min_pg_log_entries: 1
+        osd_max_pg_log_entries: 2
diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/normal_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/normal_pg_log.yaml
new file mode 100644 (file)
index 0000000..8b13789
--- /dev/null
@@ -0,0 +1 @@
+
diff --git a/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/short_pg_log.yaml b/ceph/qa/suites/upgrade/jewel-x/stress-split/1.1-pg-log-overrides/short_pg_log.yaml
new file mode 100644 (file)
index 0000000..20cc101
--- /dev/null
@@ -0,0 +1,6 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        osd_min_pg_log_entries: 1
+        osd_max_pg_log_entries: 2
index 442dcf105a793c7e92983639c284c9eae5bd54fc..a73b87beb4c55df48b2f46df00afc11191030631 100644 (file)
@@ -10,3 +10,8 @@ tasks:
 - ceph.restart:
     daemons: [mon.a,mon.b,mon.c,osd.0, osd.1, osd.2]
 - print: "**** done ceph.restart 1st half"
+- exec:
+    osd.0:
+      - ceph osd set pglog_hardlimit && exit 1 || true
+      - ceph osd dump --format=json-pretty | grep "flags"
+- print: "**** try to set pglog_hardlimit, should not succeed"
index 1d528cd5de7515caab48fc6ee10a45d42e517ab2..faea6fdbf7ec62ea176a588abf28a07794bfc95d 100644 (file)
@@ -6,4 +6,14 @@ tasks:
     daemons: [osd.3, osd.4, osd.5]
     wait-for-healthy: false
     wait-for-osds-up: true
+- exec:
+     osd.0:
+      - ceph osd require-osd-release luminous
+- print: "**** done `ceph osd require-osd-release luminous`"
+- exec:
+    osd.0:
+      - ceph osd dump --format=json-pretty | grep "flags"
+      - ceph osd set pglog_hardlimit
+      - ceph osd dump --format=json-pretty | grep "flags"
+- print: "**** try to set pglog_hardlimit again, should succeed"
 
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/% b/ceph/qa/suites/upgrade/luminous-p2p/%
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/.qa b/ceph/qa/suites/upgrade/luminous-p2p/.qa
deleted file mode 120000 (symlink)
index a602a03..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/% b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/.qa
new file mode 120000 (symlink)
index 0000000..a23f7e0
--- /dev/null
@@ -0,0 +1 @@
+../../.qa
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/point-to-point-upgrade.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/point-to-point-upgrade.yaml
new file mode 100644 (file)
index 0000000..c0b6ebb
--- /dev/null
@@ -0,0 +1,226 @@
+meta:
+- desc: |
+   Run ceph on two nodes, using one of them as a client,
+   with a separate client-only node. 
+   Use xfs beneath the osds.
+   install ceph/luminous v12.2.2 point version
+   run workload and upgrade-sequence in parallel
+   install ceph/luminous v12.2.5 point version
+   run workload and upgrade-sequence in parallel
+   install ceph/luminous v12.2.7 point version
+   run workload and upgrade-sequence in parallel
+   install ceph/luminous v12.2.8 point version
+   run workload and upgrade-sequence in parallel
+   install ceph/luminous v12.2.9 point version
+   run workload and upgrade-sequence in parallel
+   install ceph/luminous v12.2.10 point version
+   run workload and upgrade-sequence in parallel
+
+   install ceph/luminous latest version
+   run workload and upgrade-sequence in parallel
+overrides:
+  ceph:
+    log-whitelist:
+    - reached quota
+    - scrub
+    - osd_map_max_advance
+    - wrongly marked
+    - FS_DEGRADED
+    - POOL_APP_NOT_ENABLED
+    - CACHE_POOL_NO_HIT_SET
+    - POOL_FULL
+    - SMALLER_PG
+    - pool\(s\) full
+    - OSD_DOWN
+    - missing hit_sets
+    - CACHE_POOL_NEAR_FULL
+    - PG_AVAILABILITY
+    - PG_DEGRADED
+    - application not enabled
+    - overall HEALTH_
+    fs: xfs
+    conf:
+      mon:
+        mon debug unsafe allow tier with nonempty snaps: true
+        mon warn on pool no app: false
+      osd:
+        osd map max advance: 1000
+        osd_class_load_list: "cephfs hello journal lock log numops rbd refcount 
+                              replica_log rgw sdk statelog timeindex user version"
+        osd_class_default_list: "cephfs hello journal lock log numops rbd refcount 
+                                 replica_log rgw sdk statelog timeindex user version"
+      client:
+        rgw_crypt_require_ssl: false
+        rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
+roles:
+- - mon.a
+  - mds.a
+  - osd.0
+  - osd.1
+  - osd.2
+  - mgr.x
+- - mon.b
+  - mon.c
+  - osd.3
+  - osd.4
+  - osd.5
+  - client.0
+- - client.1
+openstack:
+- volumes: # attached to each instance
+    count: 3
+    size: 30 # GB
+tasks:
+- print: "****  v12.2.2 about to install"
+- install:
+    tag: v12.2.2
+    # line below can be removed; it's from the jewel test
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
+- print: "**** done v12.2.2 install"
+- ceph:
+   fs: xfs
+   add_osds_to_crush: true
+- print: "**** done ceph xfs"
+- sequential:
+   - workload
+- print: "**** done workload v12.2.2"
+
+####  upgrade to v12.2.5
+- install.upgrade:
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+    mon.a:
+      tag: v12.2.5
+    mon.b:
+      tag: v12.2.5
+    # Note that client.a IS NOT upgraded at this point
+- parallel:
+   - workload_luminous
+   - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.5"
+
+####  upgrade to v12.2.7
+- install.upgrade:
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+    mon.a:
+      tag: v12.2.7
+    mon.b:
+      tag: v12.2.7
+    # Note that client.a IS NOT upgraded at this point
+- parallel:
+   - workload_luminous
+   - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.7"
+
+####  upgrade to v12.2.8
+- install.upgrade:
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+    mon.a:
+      tag: v12.2.8
+    mon.b:
+      tag: v12.2.8
+    # Note that client.a IS NOT upgraded at this point
+- parallel:
+   - workload_luminous
+   - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.8"
+
+
+####  upgrade to v12.2.9
+- install.upgrade:
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+    mon.a:
+      tag: v12.2.9
+    mon.b:
+      tag: v12.2.9
+    # Note that client.a IS NOT upgraded at this point
+- parallel:
+   - workload_luminous
+   - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.9"
+
+####  upgrade to v12.2.10
+- install.upgrade:
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+    mon.a:
+      tag: v12.2.10
+    mon.b:
+      tag: v12.2.10
+    # Note that client.a IS NOT upgraded at this point
+- parallel:
+   - workload_luminous
+   - upgrade-sequence_luminous
+- print: "**** done parallel luminous v12.2.10"
+
+
+####  upgrade to latest luminous
+- install.upgrade:
+    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
+    mon.a:
+    mon.b:
+    # Note that client.a IS NOT upgraded at this point
+- parallel:
+   - workload_luminous
+   - upgrade-sequence_luminous
+- print: "**** done parallel luminous branch"
+
+#######################
+workload:
+   sequential:
+   - workunit:
+       clients:
+         client.0:
+           - suites/blogbench.sh
+workload_luminous:
+   full_sequential:
+   - workunit:
+       tag: v12.2.2
+       clients:
+         client.1:
+         - rados/test.sh
+         - cls
+       env:
+         CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
+   - print: "**** done rados/test.sh &  cls workload_luminous"
+   - sequential:
+     - rgw: [client.0]
+     - print: "**** done rgw workload_luminous"
+     - s3tests:
+         client.0:
+           force-branch: ceph-luminous
+           rgw_server: client.0
+           scan_for_encryption_keys: false
+     - print: "**** done s3tests workload_luminous"
+upgrade-sequence_luminous:
+   sequential:
+   - print: "**** done branch: luminous install.upgrade"
+   - ceph.restart: [mds.a]
+   - sleep:
+       duration: 60
+   - ceph.restart: [osd.0]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.1]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.2]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.3]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.4]
+   - sleep:
+       duration: 30
+   - ceph.restart: [osd.5]
+   - sleep:
+       duration: 60
+   - ceph.restart: [mon.a]
+   - sleep:
+       duration: 60
+   - ceph.restart: [mon.b]
+   - sleep:
+       duration: 60
+   - ceph.restart: [mon.c]
+   - sleep:
+       duration: 60
+   - print: "**** done ceph.restart all luminous branch mds/osd/mon"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/supported b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-parallel/supported
new file mode 120000 (symlink)
index 0000000..79010c3
--- /dev/null
@@ -0,0 +1 @@
+../../../../distros/supported
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/% b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/%
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/+ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/.qa
new file mode 120000 (symlink)
index 0000000..a602a03
--- /dev/null
@@ -0,0 +1 @@
+../.qa/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/openstack.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/openstack.yaml
new file mode 100644 (file)
index 0000000..a0d5c20
--- /dev/null
@@ -0,0 +1,6 @@
+openstack:
+  - machine:
+      disk: 100 # GB
+  - volumes: # attached to each instance
+      count: 3
+      size: 30 # GB
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/start.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/0-cluster/start.yaml
new file mode 100644 (file)
index 0000000..4f40219
--- /dev/null
@@ -0,0 +1,20 @@
+meta:
+- desc: |
+   Run ceph on two nodes,
+   with a separate client-only node.
+   Use xfs beneath the osds.
+overrides:
+  ceph:
+    fs: xfs
+roles:
+- - mon.a
+  - mon.b
+  - mon.c
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+- - osd.3
+  - osd.4
+  - osd.5
+- - client.0
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1-ceph-install/luminous.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1-ceph-install/luminous.yaml
new file mode 100644 (file)
index 0000000..b66e0ca
--- /dev/null
@@ -0,0 +1,19 @@
+meta:
+- desc: install ceph/luminous latest
+tasks:
+- install:
+    tag: v12.2.10
+    exclude_packages: ['librados3']
+    extra_packages: ['librados2']
+- print: "**** done install luminous v12.2.10"
+- ceph:
+- exec:
+    osd.0:
+      - ceph osd require-osd-release luminous
+      - ceph osd set-require-min-compat-client luminous
+- print: "**** done ceph"
+overrides:
+  ceph:
+    conf:
+      mon:
+        mon warn on osd down out interval zero: false
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/normal_pg_log.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/normal_pg_log.yaml
new file mode 100644 (file)
index 0000000..8b13789
--- /dev/null
@@ -0,0 +1 @@
+
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/short_pg_log.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/1.1-pg-log-overrides/short_pg_log.yaml
new file mode 100644 (file)
index 0000000..20cc101
--- /dev/null
@@ -0,0 +1,6 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        osd_min_pg_log_entries: 1
+        osd_max_pg_log_entries: 2
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/.qa
new file mode 120000 (symlink)
index 0000000..a602a03
--- /dev/null
@@ -0,0 +1 @@
+../.qa/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/firsthalf.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/2-partial-upgrade/firsthalf.yaml
new file mode 100644 (file)
index 0000000..a73b87b
--- /dev/null
@@ -0,0 +1,17 @@
+meta:
+- desc: |
+   install upgrade ceph/-x on one node only
+   1st half
+   restart : osd.0,1,2
+tasks:
+- install.upgrade:
+    osd.0:
+- print: "**** done install.upgrade osd.0"
+- ceph.restart:
+    daemons: [mon.a,mon.b,mon.c,osd.0, osd.1, osd.2]
+- print: "**** done ceph.restart 1st half"
+- exec:
+    osd.0:
+      - ceph osd set pglog_hardlimit && exit 1 || true
+      - ceph osd dump --format=json-pretty | grep "flags"
+- print: "**** try to set pglog_hardlimit, should not succeed"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/.qa
new file mode 120000 (symlink)
index 0000000..a602a03
--- /dev/null
@@ -0,0 +1 @@
+../.qa/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/default.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/3-thrash/default.yaml
new file mode 100644 (file)
index 0000000..b3fddef
--- /dev/null
@@ -0,0 +1,25 @@
+meta:
+- desc: |
+   randomly kill and revive osd
+   small chance to increase the number of pgs
+overrides:
+  ceph:
+    log-whitelist:
+    - but it is still running
+    - wrongly marked me down
+    - objects unfound and apparently lost
+    - log bound mismatch
+tasks:
+- parallel:
+  - stress-tasks
+stress-tasks:
+- thrashosds:
+    timeout: 1200
+    chance_pgnum_grow: 1
+    chance_pgpnum_fix: 1
+    chance_thrash_cluster_full: 0
+    chance_thrash_pg_upmap: 0
+    chance_thrash_pg_upmap_items: 0
+    disable_objectstore_tool_tests: true
+    chance_force_recovery: 0
+- print: "**** done thrashosds 3-thrash"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/+ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/.qa
new file mode 120000 (symlink)
index 0000000..a602a03
--- /dev/null
@@ -0,0 +1 @@
+../.qa/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/radosbench.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/radosbench.yaml
new file mode 100644 (file)
index 0000000..626ae8e
--- /dev/null
@@ -0,0 +1,40 @@
+meta:
+- desc: |
+   run randomized correctness test for rados operations
+   generate write load with rados bench
+stress-tasks:
+- full_sequential:
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+  - radosbench:
+      clients: [client.0]
+      time: 150
+- print: "**** done radosbench 7-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-cls.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-cls.yaml
new file mode 100644 (file)
index 0000000..f8cc4d8
--- /dev/null
@@ -0,0 +1,10 @@
+meta:
+- desc: |
+   run basic cls tests for rbd
+stress-tasks:
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+        - cls/test_cls_rbd.sh
+- print: "**** done cls/test_cls_rbd.sh 5-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-import-export.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd-import-export.yaml
new file mode 100644 (file)
index 0000000..30a677a
--- /dev/null
@@ -0,0 +1,12 @@
+meta:
+- desc: |
+   run basic import/export cli tests for rbd
+stress-tasks:
+- workunit:
+    branch: luminous
+    clients:
+      client.0:
+        - rbd/import_export.sh
+    env:
+      RBD_CREATE_ARGS: --new-format
+- print: "**** done rbd/import_export.sh 5-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd_api.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/rbd_api.yaml
new file mode 100644 (file)
index 0000000..9079aa3
--- /dev/null
@@ -0,0 +1,10 @@
+meta:
+- desc: |
+   librbd C and C++ api tests
+stress-tasks:
+- workunit:
+     branch: luminous
+     clients:
+        client.0:
+           - rbd/test_librbd.sh
+- print: "**** done rbd/test_librbd.sh 7-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/readwrite.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/readwrite.yaml
new file mode 100644 (file)
index 0000000..41e34d6
--- /dev/null
@@ -0,0 +1,16 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool,
+   using only reads, writes, and deletes
+stress-tasks:
+- full_sequential:
+  - rados:
+      clients: [client.0]
+      ops: 4000
+      objects: 500
+      write_append_excl: false
+      op_weights:
+        read: 45
+        write: 45
+        delete: 10
+- print: "**** done rados/readwrite 5-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/snaps-few-objects.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/4-workload/snaps-few-objects.yaml
new file mode 100644 (file)
index 0000000..f56d0de
--- /dev/null
@@ -0,0 +1,18 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool with snapshot operations
+stress-tasks:
+- full_sequential:
+  - rados:
+      clients: [client.0]
+      ops: 4000
+      objects: 50
+      write_append_excl: false
+      op_weights:
+        read: 100
+        write: 100
+        delete: 50
+        snap_create: 50
+        snap_remove: 50
+        rollback: 50
+- print: "**** done rados/snaps-few-objects 5-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/5-finish-upgrade.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/5-finish-upgrade.yaml
new file mode 100644 (file)
index 0000000..9d5a96c
--- /dev/null
@@ -0,0 +1,14 @@
+tasks:
+- install.upgrade:
+    osd.3:
+    client.0:
+- ceph.restart:
+    daemons: [osd.3, osd.4, osd.5]
+    wait-for-healthy: false
+    wait-for-osds-up: true
+- exec:
+    osd.0:
+      - ceph osd set pglog_hardlimit
+      - ceph osd dump --format=json-pretty | grep "flags"
+- print: "**** try to set pglog_hardlimit again, should succeed"
+
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/+ b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/+
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/.qa b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/.qa
new file mode 120000 (symlink)
index 0000000..a602a03
--- /dev/null
@@ -0,0 +1 @@
+../.qa/
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rbd-python.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rbd-python.yaml
new file mode 100644 (file)
index 0000000..56ba21d
--- /dev/null
@@ -0,0 +1,9 @@
+meta:
+- desc: |
+   librbd python api tests
+tasks:
+- workunit:
+    clients:
+      client.0:
+        - rbd/test_librbd_python.sh
+- print: "**** done rbd/test_librbd_python.sh 9-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rgw-swift.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/rgw-swift.yaml
new file mode 100644 (file)
index 0000000..76e5d6f
--- /dev/null
@@ -0,0 +1,11 @@
+meta:
+- desc: |
+   swift api tests for rgw
+tasks:
+- rgw:
+    client.0:
+- print: "**** done rgw 9-workload"
+- swift:
+    client.0:
+      rgw_server: client.0
+- print: "**** done swift 9-workload"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/snaps-many-objects.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/7-final-workload/snaps-many-objects.yaml
new file mode 100644 (file)
index 0000000..805bf97
--- /dev/null
@@ -0,0 +1,16 @@
+meta:
+- desc: |
+   randomized correctness test for rados operations on a replicated pool with snapshot operations
+tasks:
+- rados:
+    clients: [client.0]
+    ops: 4000
+    objects: 500
+    write_append_excl: false
+    op_weights:
+      read: 100
+      write: 100
+      delete: 50
+      snap_create: 50
+      snap_remove: 50
+      rollback: 50
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/supported b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/supported
new file mode 120000 (symlink)
index 0000000..79010c3
--- /dev/null
@@ -0,0 +1 @@
+../../../../distros/supported
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/thrashosds-health.yaml b/ceph/qa/suites/upgrade/luminous-p2p/luminous-p2p-stress-split/thrashosds-health.yaml
new file mode 120000 (symlink)
index 0000000..e0426db
--- /dev/null
@@ -0,0 +1 @@
+../../../../tasks/thrashosds-health.yaml
\ No newline at end of file
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/point-to-point-upgrade.yaml b/ceph/qa/suites/upgrade/luminous-p2p/point-to-point-upgrade.yaml
deleted file mode 100644 (file)
index 9deeb4c..0000000
+++ /dev/null
@@ -1,193 +0,0 @@
-meta:
-- desc: |
-   Run ceph on two nodes, using one of them as a client,
-   with a separate client-only node. 
-   Use xfs beneath the osds.
-   install ceph/luminous v12.2.2 point version
-   run workload and upgrade-sequence in parallel
-   install ceph/luminous v12.2.5 point version
-   run workload and upgrade-sequence in parallel
-   install ceph/luminous v12.2.7 point version
-   run workload and upgrade-sequence in parallel
-   install ceph/luminous v12.2.8 point version
-   run workload and upgrade-sequence in parallel
-   install ceph/luminous latest version
-   run workload and upgrade-sequence in parallel
-overrides:
-  ceph:
-    log-whitelist:
-    - reached quota
-    - scrub
-    - osd_map_max_advance
-    - wrongly marked
-    - FS_DEGRADED
-    - POOL_APP_NOT_ENABLED
-    - CACHE_POOL_NO_HIT_SET
-    - POOL_FULL
-    - SMALLER_PG
-    - pool\(s\) full
-    - OSD_DOWN
-    - missing hit_sets
-    - CACHE_POOL_NEAR_FULL
-    - PG_AVAILABILITY
-    - PG_DEGRADED
-    - application not enabled
-    - overall HEALTH_
-    fs: xfs
-    conf:
-      mon:
-        mon debug unsafe allow tier with nonempty snaps: true
-        mon warn on pool no app: false
-      osd:
-        osd map max advance: 1000
-        osd_class_load_list: "cephfs hello journal lock log numops rbd refcount 
-                              replica_log rgw sdk statelog timeindex user version"
-        osd_class_default_list: "cephfs hello journal lock log numops rbd refcount 
-                                 replica_log rgw sdk statelog timeindex user version"
-      client:
-        rgw_crypt_require_ssl: false
-        rgw crypt s3 kms encryption keys: testkey-1=YmluCmJvb3N0CmJvb3N0LWJ1aWxkCmNlcGguY29uZgo= testkey-2=aWIKTWFrZWZpbGUKbWFuCm91dApzcmMKVGVzdGluZwo=
-roles:
-- - mon.a
-  - mds.a
-  - osd.0
-  - osd.1
-  - osd.2
-  - mgr.x
-- - mon.b
-  - mon.c
-  - osd.3
-  - osd.4
-  - osd.5
-  - client.0
-- - client.1
-openstack:
-- volumes: # attached to each instance
-    count: 3
-    size: 30 # GB
-tasks:
-- print: "****  v12.2.2 about to install"
-- install:
-    tag: v12.2.2
-    # line below can be removed its from jewel test
-    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev', 'librgw2']
-- print: "**** done v12.2.2 install"
-- ceph:
-   fs: xfs
-   add_osds_to_crush: true
-- print: "**** done ceph xfs"
-- sequential:
-   - workload
-- print: "**** done workload v12.2.2"
-
-####  upgrade to v12.2.5
-- install.upgrade:
-    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-    mon.a:
-      tag: v12.2.5
-    mon.b:
-      tag: v12.2.5
-    # Note that client.a IS NOT upgraded at this point
-- parallel:
-   - workload_luminous
-   - upgrade-sequence_luminous
-- print: "**** done parallel luminous v12.2.5"
-
-####  upgrade to v12.2.7
-- install.upgrade:
-    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-    mon.a:
-      tag: v12.2.7
-    mon.b:
-      tag: v12.2.7
-    # Note that client.a IS NOT upgraded at this point
-- parallel:
-   - workload_luminous
-   - upgrade-sequence_luminous
-- print: "**** done parallel luminous v12.2.7"
-
-####  upgrade to v12.2.8
-- install.upgrade:
-    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-    mon.a:
-      tag: v12.2.8
-    mon.b:
-      tag: v12.2.8
-    # Note that client.a IS NOT upgraded at this point
-- parallel:
-   - workload_luminous
-   - upgrade-sequence_luminous
-- print: "**** done parallel luminous v12.2.8"
-
-####  upgrade to latest luminous
-- install.upgrade:
-    #exclude_packages: ['ceph-mgr','libcephfs2','libcephfs-devel','libcephfs-dev']
-    mon.a:
-    mon.b:
-    # Note that client.a IS NOT upgraded at this point
-- parallel:
-   - workload_luminous
-   - upgrade-sequence_luminous
-- print: "**** done parallel luminous branch"
-
-#######################
-workload:
-   sequential:
-   - workunit:
-       clients:
-         client.0:
-           - suites/blogbench.sh
-workload_luminous:
-   full_sequential:
-   - workunit:
-       tag: v12.2.2
-       clients:
-         client.1:
-         - rados/test.sh
-         - cls
-       env:
-         CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
-   - print: "**** done rados/test.sh &  cls workload_luminous"
-   - sequential:
-     - rgw: [client.0]
-     - print: "**** done rgw workload_luminous"
-     - s3tests:
-         client.0:
-           force-branch: ceph-luminous
-           rgw_server: client.0
-           scan_for_encryption_keys: false
-     - print: "**** done s3tests workload_luminous"
-upgrade-sequence_luminous:
-   sequential:
-   - print: "**** done branch: luminous install.upgrade"
-   - ceph.restart: [mds.a]
-   - sleep:
-       duration: 60
-   - ceph.restart: [osd.0]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.1]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.2]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.3]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.4]
-   - sleep:
-       duration: 30
-   - ceph.restart: [osd.5]
-   - sleep:
-       duration: 60
-   - ceph.restart: [mon.a]
-   - sleep:
-       duration: 60
-   - ceph.restart: [mon.b]
-   - sleep:
-       duration: 60
-   - ceph.restart: [mon.c]
-   - sleep:
-       duration: 60
-   - print: "**** done ceph.restart all luminous branch mds/osd/mon"
diff --git a/ceph/qa/suites/upgrade/luminous-p2p/supported b/ceph/qa/suites/upgrade/luminous-p2p/supported
deleted file mode 120000 (symlink)
index dd0d7f1..0000000
+++ /dev/null
@@ -1 +0,0 @@
-../../../distros/supported/
\ No newline at end of file
index 7f9253aabc29adfbf49204bea040e31f977dc2b4..e030012252151a4e7aee268f265cc972d0c2fe13 100644 (file)
@@ -171,8 +171,10 @@ class CephCluster(object):
         del self._ctx.ceph['ceph'].conf[subsys][key]
         write_conf(self._ctx)
 
-    def json_asok(self, command, service_type, service_id):
-        proc = self.mon_manager.admin_socket(service_type, service_id, command)
+    def json_asok(self, command, service_type, service_id, timeout=None):
+        if timeout is None:
+            timeout = 15*60
+        proc = self.mon_manager.admin_socket(service_type, service_id, command, timeout=timeout)
         response_data = proc.stdout.getvalue()
         log.info("_json_asok output: {0}".format(response_data))
         if response_data.strip():
@@ -444,10 +446,10 @@ class Filesystem(MDSCluster):
         self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)
 
     def set_max_mds(self, max_mds):
-        self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", "%d" % max_mds)
+        self.set_var("max_mds", "%d" % max_mds)
 
     def set_allow_dirfrags(self, yes):
-        self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it')
+        self.set_var("allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it')
 
     def get_pgs_per_fs_pool(self):
         """
@@ -559,8 +561,10 @@ class Filesystem(MDSCluster):
     def _df(self):
         return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty"))
 
-    def get_mds_map(self):
-        return self.status().get_fsmap(self.id)['mdsmap']
+    def get_mds_map(self, status=None):
+        if status is None:
+            status = self.status()
+        return status.get_fsmap(self.id)['mdsmap']
 
     def get_var(self, var):
         return self.status().get_fsmap(self.id)['mdsmap'][var]
@@ -855,15 +859,15 @@ class Filesystem(MDSCluster):
 
         return version
 
-    def mds_asok(self, command, mds_id=None):
+    def mds_asok(self, command, mds_id=None, timeout=None):
         if mds_id is None:
             mds_id = self.get_lone_mds_id()
 
-        return self.json_asok(command, 'mds', mds_id)
+        return self.json_asok(command, 'mds', mds_id, timeout=timeout)
 
-    def rank_asok(self, command, rank=0):
-        info = self.get_rank(rank=rank)
-        return self.json_asok(command, 'mds', info['name'])
+    def rank_asok(self, command, rank=0, status=None, timeout=None):
+        info = self.get_rank(rank=rank, status=status)
+        return self.json_asok(command, 'mds', info['name'], timeout=timeout)
 
     def read_cache(self, path, depth=None):
         cmd = ["dump", "tree", path]
@@ -893,9 +897,17 @@ class Filesystem(MDSCluster):
         while True:
             status = self.status()
             if rank is not None:
-                mds_info = status.get_rank(self.id, rank)
-                current_state = mds_info['state'] if mds_info else None
-                log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state))
+                try:
+                    mds_info = status.get_rank(self.id, rank)
+                    current_state = mds_info['state'] if mds_info else None
+                    log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state))
+                except:
+                    mdsmap = self.get_mds_map(status=status)
+                    if rank in mdsmap['failed']:
+                        log.info("Waiting for rank {0} to come back.".format(rank))
+                        current_state = None
+                    else:
+                        raise
             elif mds_id is not None:
                 # mds_info is None if no daemon with this ID exists in the map
                 mds_info = status.get_mds(mds_id)
@@ -1166,6 +1178,9 @@ class Filesystem(MDSCluster):
         """
         return ""
 
+    def _make_rank(self, rank):
+        return "{}:{}".format(self.name, rank)
+
     def _run_tool(self, tool, args, rank=None, quiet=False):
         # Tests frequently have [client] configuration that jacks up
         # the objecter log level (unlikely to be interesting here)
@@ -1176,7 +1191,7 @@ class Filesystem(MDSCluster):
             base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1']
 
         if rank is not None:
-            base_args.extend(["--rank", "%d" % rank])
+            base_args.extend(["--rank", "%s" % str(rank)])
 
         t1 = datetime.datetime.now()
         r = self.tool_remote.run(
@@ -1198,11 +1213,12 @@ class Filesystem(MDSCluster):
         mds_id = self.mds_ids[0]
         return self.mds_daemons[mds_id].remote
 
-    def journal_tool(self, args, rank=None, quiet=False):
+    def journal_tool(self, args, rank, quiet=False):
         """
-        Invoke cephfs-journal-tool with the passed arguments, and return its stdout
+        Invoke cephfs-journal-tool with the passed arguments for a rank, and return its stdout
         """
-        return self._run_tool("cephfs-journal-tool", args, rank, quiet)
+        fs_rank = self._make_rank(rank)
+        return self._run_tool("cephfs-journal-tool", args, fs_rank, quiet)
 
     def table_tool(self, args, quiet=False):
         """
index b121680b0b39040ff31044c5f7a8f6776621222c..33bcf8c60419df19d8c209ad397713ec040f6cb3 100644 (file)
@@ -50,6 +50,7 @@ class FuseMount(CephFSMount):
                 '--',
                 self.mountpoint,
             ],
+            timeout=(15*60)
         )
 
         run_cmd = [
@@ -88,12 +89,14 @@ class FuseMount(CephFSMount):
         def list_connections():
             self.client_remote.run(
                 args=["sudo", "mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
-                check_status=False
+                check_status=False,
+                timeout=(15*60)
             )
             p = self.client_remote.run(
                 args=["ls", "/sys/fs/fuse/connections"],
                 stdout=StringIO(),
-                check_status=False
+                check_status=False,
+                timeout=(15*60)
             )
             if p.exitstatus != 0:
                 return []
@@ -163,7 +166,8 @@ class FuseMount(CephFSMount):
             ],
             stdout=StringIO(),
             stderr=StringIO(),
-            wait=False
+            wait=False,
+            timeout=(15*60)
         )
         try:
             proc.wait()
@@ -202,11 +206,18 @@ class FuseMount(CephFSMount):
 
         # Now that we're mounted, set permissions so that the rest of the test will have
         # unrestricted access to the filesystem mount.
-        self.client_remote.run(
-            args=['sudo', 'chmod', '1777', self.mountpoint])
+        try:
+            stderr = StringIO()
+            self.client_remote.run(args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(15*60), stderr=stderr)
+        except run.CommandFailedError:
+            stderr = stderr.getvalue()
+            if "Read-only file system".lower() in stderr.lower():
+                pass
+            else:
+                raise
 
     def _mountpoint_exists(self):
-        return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False).exitstatus == 0
+        return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False, timeout=(15*60)).exitstatus == 0
 
     def umount(self):
         try:
@@ -218,6 +229,7 @@ class FuseMount(CephFSMount):
                     '-u',
                     self.mountpoint,
                 ],
+                timeout=(30*60),
             )
         except run.CommandFailedError:
             log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name))
@@ -229,7 +241,7 @@ class FuseMount(CephFSMount):
                 run.Raw(';'),
                 'ps',
                 'auxf',
-            ])
+            ], timeout=(60*15))
 
             # abort the fuse mount, killing all hung processes
             if self._fuse_conn:
@@ -252,7 +264,8 @@ class FuseMount(CephFSMount):
                         '-f',
                         self.mountpoint,
                     ],
-                    stderr=stderr
+                    stderr=stderr,
+                    timeout=(60*15)
                 )
             except CommandFailedError:
                 if self.is_mounted():
@@ -307,7 +320,8 @@ class FuseMount(CephFSMount):
                     '--',
                     self.mountpoint,
                 ],
-                stderr=stderr
+                stderr=stderr,
+                timeout=(60*5)
             )
         except CommandFailedError:
             if "No such file or directory" in stderr.getvalue():
@@ -354,6 +368,7 @@ class FuseMount(CephFSMount):
                 '-rf',
                 self.mountpoint,
             ],
+            timeout=(60*5)
         )
 
     def _asok_path(self):
@@ -392,15 +407,15 @@ print find_socket("{client_name}")
 
         # Find the admin socket
         p = self.client_remote.run(args=[
-            'python', '-c', pyscript
-        ], stdout=StringIO())
+            'sudo', 'python2', '-c', pyscript
+        ], stdout=StringIO(), timeout=(15*60))
         asok_path = p.stdout.getvalue().strip()
         log.info("Found client admin socket at {0}".format(asok_path))
 
         # Query client ID from admin socket
         p = self.client_remote.run(
             args=['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args,
-            stdout=StringIO())
+            stdout=StringIO(), timeout=(15*60))
         return json.loads(p.stdout.getvalue())
 
     def get_global_id(self):
index 80271a6ebbb9ea6a7c6785c8817eafcab718a788..4fdbd1b0c5f94d3f7387a7f54d7f4e547e729bd4 100644 (file)
@@ -43,6 +43,7 @@ class KernelMount(CephFSMount):
                 run.Raw('>'),
                 filename,
             ],
+            timeout=(5*60),
         )
 
     def mount(self, mount_path=None, mount_fs_name=None):
@@ -60,6 +61,7 @@ class KernelMount(CephFSMount):
                 '--',
                 self.mountpoint,
             ],
+            timeout=(5*60),
         )
 
         if mount_path is None:
@@ -84,10 +86,11 @@ class KernelMount(CephFSMount):
                 '-o',
                 opts
             ],
+            timeout=(30*60),
         )
 
         self.client_remote.run(
-            args=['sudo', 'chmod', '1777', self.mountpoint])
+            args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(5*60))
 
         self.mounted = True
 
@@ -99,7 +102,7 @@ class KernelMount(CephFSMount):
             cmd.append('-f')
 
         try:
-            self.client_remote.run(args=cmd, timeout=(5*60))
+            self.client_remote.run(args=cmd, timeout=(15*60))
         except Exception as e:
             self.client_remote.run(args=[
                 'sudo',
@@ -107,7 +110,7 @@ class KernelMount(CephFSMount):
                 'lsof',
                 run.Raw(';'),
                 'ps', 'auxf',
-            ])
+            ], timeout=(15*60))
             raise e
 
         rproc = self.client_remote.run(
@@ -194,6 +197,7 @@ class KernelMount(CephFSMount):
                 '--',
                 self.mountpoint,
             ],
+            timeout=(5*60),
         )
 
     def _find_debug_dir(self):
@@ -219,7 +223,7 @@ class KernelMount(CephFSMount):
 
         p = self.client_remote.run(args=[
             'sudo', 'python', '-c', pyscript
-        ], stdout=StringIO())
+        ], stdout=StringIO(), timeout=(5*60))
         client_id_to_dir = json.loads(p.stdout.getvalue())
 
         try:
@@ -241,7 +245,7 @@ class KernelMount(CephFSMount):
 
         p = self.client_remote.run(args=[
             'sudo', 'python', '-c', pyscript
-        ], stdout=StringIO())
+        ], stdout=StringIO(), timeout=(5*60))
         return p.stdout.getvalue()
 
     def get_global_id(self):
index b06d5123d8f2396685ab1e6316316c0ba5204fb0..1f1d5467079ccf60027e05d18e08a0fcabf6cf58 100644 (file)
@@ -134,10 +134,10 @@ class TestClientLimits(CephFSTestCase):
         # Client B tries to stat the file that client A created
         rproc = self.mount_b.write_background("file1")
 
-        # After mds_session_timeout, we should see a health warning (extra lag from
+        # After session_timeout, we should see a health warning (extra lag from
         # MDS beacon period)
-        mds_session_timeout = float(self.fs.get_config("mds_session_timeout"))
-        self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_session_timeout + 10)
+        session_timeout = self.fs.get_var("session_timeout")
+        self.wait_for_health("MDS_CLIENT_LATE_RELEASE", session_timeout + 10)
 
         # Client B should still be stuck
         self.assertFalse(rproc.finished)
index 829ca3d5c6a8fc29f0dc234da5eaf8ee7023f0c3..2b91cbfe6a1a29e460aac11efd303560ffb0ce06 100644 (file)
@@ -30,10 +30,9 @@ class TestClientNetworkRecovery(CephFSTestCase):
     REQUIRE_ONE_CLIENT_REMOTE = True
     CLIENTS_REQUIRED = 2
 
-    LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
+    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
 
     # Environment references
-    mds_session_timeout = None
     mds_reconnect_timeout = None
     ms_max_backoff = None
 
@@ -45,6 +44,8 @@ class TestClientNetworkRecovery(CephFSTestCase):
         I/O after failure.
         """
 
+        session_timeout = self.fs.get_var("session_timeout")
+
         # We only need one client
         self.mount_b.umount_wait()
 
@@ -67,7 +68,7 @@ class TestClientNetworkRecovery(CephFSTestCase):
         # ...then it should block
         self.assertFalse(write_blocked.finished)
         self.assert_session_state(client_id, "open")
-        time.sleep(self.mds_session_timeout * 1.5)  # Long enough for MDS to consider session stale
+        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
         self.assertFalse(write_blocked.finished)
         self.assert_session_state(client_id, "stale")
 
@@ -87,10 +88,9 @@ class TestClientRecovery(CephFSTestCase):
     REQUIRE_KCLIENT_REMOTE = True
     CLIENTS_REQUIRED = 2
 
-    LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
+    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
 
     # Environment references
-    mds_session_timeout = None
     mds_reconnect_timeout = None
     ms_max_backoff = None
 
@@ -214,6 +214,8 @@ class TestClientRecovery(CephFSTestCase):
         self.mount_a.create_destroy()
 
     def test_stale_caps(self):
+        session_timeout = self.fs.get_var("session_timeout")
+
         # Capability release from stale session
         # =====================================
         cap_holder = self.mount_a.open_background()
@@ -226,7 +228,7 @@ class TestClientRecovery(CephFSTestCase):
         self.mount_a.kill()
 
         try:
-            # Now, after mds_session_timeout seconds, the waiter should
+            # Now, after session_timeout seconds, the waiter should
             # complete their operation when the MDS marks the holder's
             # session stale.
             cap_waiter = self.mount_b.write_background()
@@ -239,9 +241,9 @@ class TestClientRecovery(CephFSTestCase):
 
             cap_waited = b - a
             log.info("cap_waiter waited {0}s".format(cap_waited))
-            self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
+            self.assertTrue(session_timeout / 2.0 <= cap_waited <= session_timeout * 2.0,
                             "Capability handover took {0}, expected approx {1}".format(
-                                cap_waited, self.mds_session_timeout
+                                cap_waited, session_timeout
                             ))
 
             cap_holder.stdin.close()
@@ -261,6 +263,8 @@ class TestClientRecovery(CephFSTestCase):
         # Eviction while holding a capability
         # ===================================
 
+        session_timeout = self.fs.get_var("session_timeout")
+
         # Take out a write capability on a file on client A,
         # and then immediately kill it.
         cap_holder = self.mount_a.open_background()
@@ -290,9 +294,9 @@ class TestClientRecovery(CephFSTestCase):
             log.info("cap_waiter waited {0}s".format(cap_waited))
             # This is the check that it happened 'now' rather than waiting
             # for the session timeout
-            self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
+            self.assertLess(cap_waited, session_timeout / 2.0,
                             "Capability handover took {0}, expected less than {1}".format(
-                                cap_waited, self.mds_session_timeout / 2.0
+                                cap_waited, session_timeout / 2.0
                             ))
 
             cap_holder.stdin.close()
@@ -479,6 +483,8 @@ class TestClientRecovery(CephFSTestCase):
         if not isinstance(self.mount_a, FuseMount):
             raise SkipTest("Require FUSE client to handle signal STOP/CONT")
 
+        session_timeout = self.fs.get_var("session_timeout")
+
         self.mount_a.run_shell(["mkdir", "testdir"])
         self.mount_a.run_shell(["touch", "testdir/file1"])
         # populate readdir cache
@@ -497,7 +503,7 @@ class TestClientRecovery(CephFSTestCase):
         self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid])
 
         self.assert_session_state(mount_b_gid, "open")
-        time.sleep(self.mds_session_timeout * 1.5)  # Long enough for MDS to consider session stale
+        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
         self.assert_session_state(mount_b_gid, "stale")
 
         self.mount_a.run_shell(["touch", "testdir/file2"])
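
The client-recovery hunks above stop loading the removed mds_session_timeout setting through LOAD_SETTINGS and instead read the per-filesystem session_timeout from the MDS map at run time. A minimal sketch of the pattern, assuming the usual `fs` fixture and the `time` module already imported by these tests:

    # get_var() looks the value up in the filesystem's mdsmap, so the test
    # always sees the timeout the cluster is actually using.
    session_timeout = fs.get_var("session_timeout")
    time.sleep(session_timeout * 1.5)  # long enough for the session to go stale
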
index 380b49c4b6584918a7bc70353f69ac8959ecde0f..01e9d5803b1b399001bac1dec84b3c186253b418 100644 (file)
@@ -12,6 +12,7 @@ DAMAGED_ON_START = "damaged_on_start"
 DAMAGED_ON_LS = "damaged_on_ls"
 CRASHED = "server crashed"
 NO_DAMAGE = "no damage"
+READONLY = "readonly"
 FAILED_CLIENT = "client failed"
 FAILED_SERVER = "server failed"
 
@@ -134,8 +135,8 @@ class TestDamage(CephFSTestCase):
         mutations = []
 
         # Removals
-        for obj_id in objects:
-            if obj_id in [
+        for o in objects:
+            if o in [
                 # JournalPointers are auto-replaced if missing (same path as upgrade)
                 "400.00000000",
                 # Missing dirfrags for non-system dirs result in empty directory
@@ -148,29 +149,37 @@ class TestDamage(CephFSTestCase):
                 expectation = DAMAGED_ON_START
 
             log.info("Expectation on rm '{0}' will be '{1}'".format(
-                obj_id, expectation
+                o, expectation
             ))
 
             mutations.append(MetadataMutation(
-                obj_id,
-                "Delete {0}".format(obj_id),
-                lambda o=obj_id: self.fs.rados(["rm", o]),
+                o,
+                "Delete {0}".format(o),
+                lambda o=o: self.fs.rados(["rm", o]),
                 expectation
             ))
 
         # Blatant corruptions
-        mutations.extend([
-            MetadataMutation(
-                o,
-                "Corrupt {0}".format(o),
-                lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk),
-                DAMAGED_ON_START
-            ) for o in data_objects
-        ])
-
-        # Truncations
         for obj_id in data_objects:
             if obj_id == "500.00000000":
+                # purge queue corruption results in read-only FS
+                mutations.append(MetadataMutation(
+                    obj_id,
+                    "Corrupt {0}".format(obj_id),
+                    lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk),
+                    READONLY
+                ))
+            else:
+                mutations.append(MetadataMutation(
+                    obj_id,
+                    "Corrupt {0}".format(obj_id),
+                    lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk),
+                    DAMAGED_ON_START
+                ))
+
+        # Truncations
+        for o in data_objects:
+            if o == "500.00000000":
                 # The PurgeQueue is allowed to be empty: Journaler interprets
                 # an empty header object as an empty journal.
                 expectation = NO_DAMAGE
@@ -182,7 +191,7 @@ class TestDamage(CephFSTestCase):
                     o,
                     "Truncate {0}".format(o),
                     lambda o=o: self.fs.rados(["truncate", o, "0"]),
-                    DAMAGED_ON_START
+                    expectation
             ))
 
         # OMAP value corruptions
@@ -204,22 +213,22 @@ class TestDamage(CephFSTestCase):
             )
 
         # OMAP header corruptions
-        for obj_id in omap_header_objs:
-            if re.match("60.\.00000000", obj_id) \
-                    or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
+        for o in omap_header_objs:
+            if re.match("60.\.00000000", o) \
+                    or o in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
                 expectation = DAMAGED_ON_START
             else:
                 expectation = NO_DAMAGE
 
             log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
-                obj_id, expectation
+                o, expectation
             ))
 
             mutations.append(
                 MetadataMutation(
-                    obj_id,
-                    "Corrupt omap header on {0}".format(obj_id),
-                    lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]),
+                    o,
+                    "Corrupt omap header on {0}".format(o),
+                    lambda o=o: self.fs.rados(["setomapheader", o, junk]),
                     expectation
                 )
             )
@@ -314,7 +323,17 @@ class TestDamage(CephFSTestCase):
                     else:
                         log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
                         results[mutation] = FAILED_SERVER
-
+            elif mutation.expectation == READONLY:
+                proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False)
+                try:
+                    proc.wait()
+                except CommandFailedError:
+                    stderr = proc.stderr.getvalue()
+                    log.info(stderr)
+                    if "Read-only file system".lower() in stderr.lower():
+                        pass
+                    else:
+                        raise
             else:
                 try:
                     wait([proc], 20)
@@ -480,7 +499,7 @@ class TestDamage(CephFSTestCase):
 
         # Drop everything from the MDS cache
         self.mds_cluster.mds_stop()
-        self.fs.journal_tool(['journal', 'reset'])
+        self.fs.journal_tool(['journal', 'reset'], 0)
         self.mds_cluster.mds_fail_restart()
         self.fs.wait_for_daemons()
 
index a2d315768aa03b6863c38c7bcedf6bb595ae0b6e..1e774554188861cd5ea333505b0ddd1d146e640c 100644 (file)
@@ -362,9 +362,9 @@ class TestDataScan(CephFSTestCase):
         if False:
             with self.assertRaises(CommandFailedError):
                 # Normal reset should fail when no objects are present, we'll use --force instead
-                self.fs.journal_tool(["journal", "reset"])
+                self.fs.journal_tool(["journal", "reset"], 0)
 
-        self.fs.journal_tool(["journal", "reset", "--force"])
+        self.fs.journal_tool(["journal", "reset", "--force"], 0)
         self.fs.data_scan(["init"])
         self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
         self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
index 1f84e4200a399253e33be79ba303e50196383b5a..ee0b1c92b1992e64bae7cfff36a31f205dad890a 100644 (file)
@@ -44,7 +44,7 @@ class TestFlush(CephFSTestCase):
 
         # ...and the journal is truncated to just a single subtreemap from the
         # newly created segment
-        summary_output = self.fs.journal_tool(["event", "get", "summary"])
+        summary_output = self.fs.journal_tool(["event", "get", "summary"], 0)
         try:
             self.assertEqual(summary_output,
                              dedent(
@@ -72,7 +72,7 @@ class TestFlush(CephFSTestCase):
                              ).strip())
             flush_data = self.fs.mds_asok(["flush", "journal"])
             self.assertEqual(flush_data['return_code'], 0)
-            self.assertEqual(self.fs.journal_tool(["event", "get", "summary"]),
+            self.assertEqual(self.fs.journal_tool(["event", "get", "summary"], 0),
                              dedent(
                                  """
                                  Events by type:
index 1f80366af0cf98ccf8b9cec365a9ba182f1df6b8..e165780f31f188641fd527474e7155482bc14c6a 100644 (file)
@@ -242,10 +242,10 @@ class TestForwardScrub(CephFSTestCase):
         # is all that will be in the InoTable in memory)
 
         self.fs.journal_tool(["event", "splice",
-            "--inode={0}".format(inos["./file2_sixmegs"]), "summary"])
+                              "--inode={0}".format(inos["./file2_sixmegs"]), "summary"], 0)
 
         self.fs.journal_tool(["event", "splice",
-            "--inode={0}".format(inos["./file3_sixmegs"]), "summary"])
+                              "--inode={0}".format(inos["./file3_sixmegs"]), "summary"], 0)
 
         # Revert to old inotable.
         for key, value in inotable_copy.iteritems():
index a62ef743216471116c27b447f209d3f0370d8b59..54a49cea2fc6bccdea3ee158cfe83581a01fe30c 100644 (file)
@@ -33,7 +33,6 @@ class TestFragmentation(CephFSTestCase):
         Apply kwargs as MDS configuration settings, enable dirfrags
         and restart the MDSs.
         """
-        kwargs['mds_bal_frag'] = "true"
 
         for k, v in kwargs.items():
             self.ceph_cluster.set_ceph_conf("mds", k, v.__str__())
index 64fe939804ee25ca847edd552cbbf7ce3351546b..5f956be93aaab857b6cd29f03ff12972bc1ca569 100644 (file)
@@ -82,13 +82,14 @@ class TestJournalMigration(CephFSTestCase):
             ))
 
         # Verify that cephfs-journal-tool can now read the rewritten journal
-        inspect_out = self.fs.journal_tool(["journal", "inspect"])
+        inspect_out = self.fs.journal_tool(["journal", "inspect"], 0)
         if not inspect_out.endswith(": OK"):
             raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
                 inspect_out
             ))
 
-        self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"])
+        self.fs.journal_tool(["event", "get", "json",
+                              "--path", "/tmp/journal.json"], 0)
         p = self.fs.tool_remote.run(
             args=[
                 "python",
index 62cbbb0684a76a19b3fc4fbd75aa5d48c187f901..9832f91a10ac8b1b33e916f223a75d54d56327e0 100644 (file)
@@ -77,7 +77,7 @@ class TestJournalRepair(CephFSTestCase):
         self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
 
         # Execute the dentry recovery, this should populate the backing store
-        self.fs.journal_tool(['event', 'recover_dentries', 'list'])
+        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
 
         # Dentries in ROOT_INO are present
         self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
@@ -87,7 +87,7 @@ class TestJournalRepair(CephFSTestCase):
 
         # Now check the MDS can read what we wrote: truncate the journal
         # and start the mds.
-        self.fs.journal_tool(['journal', 'reset'])
+        self.fs.journal_tool(['journal', 'reset'], 0)
         self.fs.mds_fail_restart()
         self.fs.wait_for_daemons()
 
@@ -265,10 +265,10 @@ class TestJournalRepair(CephFSTestCase):
         self.fs.mds_stop(active_mds_names[0])
         self.fs.mds_fail(active_mds_names[0])
         # Invoke recover_dentries quietly, because otherwise log spews millions of lines
-        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
-        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
+        self.fs.journal_tool(["event", "recover_dentries", "summary"], 0, quiet=True)
+        self.fs.journal_tool(["event", "recover_dentries", "summary"], 1, quiet=True)
         self.fs.table_tool(["0", "reset", "session"])
-        self.fs.journal_tool(["journal", "reset"], rank=0)
+        self.fs.journal_tool(["journal", "reset"], 0)
         self.fs.erase_mds_objects(1)
         self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                 '--yes-i-really-mean-it')
index 4158538fd41d25f3d17c15171300f28ba8fb6751..c27278008a4ba58693959f064f45007f5c393436 100644 (file)
@@ -7,15 +7,13 @@ import errno
 import time
 import json
 import logging
+import time
 
 log = logging.getLogger(__name__)
 
 class TestMisc(CephFSTestCase):
     CLIENTS_REQUIRED = 2
 
-    LOAD_SETTINGS = ["mds_session_autoclose"]
-    mds_session_autoclose = None
-
     def test_getattr_caps(self):
         """
         Check if MDS recognizes the 'mask' parameter of open request.
@@ -43,6 +41,16 @@ class TestMisc(CephFSTestCase):
 
         self.mount_a.kill_background(p)
 
+    def test_root_rctime(self):
+        """
+        Check that the root inode has a non-default rctime on startup.
+        """
+
+        t = time.time()
+        rctime = self.mount_a.getfattr(".", "ceph.dir.rctime")
+        log.info("rctime = {}".format(rctime))
+        self.assertGreaterEqual(rctime, t-10)
+
     def test_fs_new(self):
         data_pool_name = self.fs.get_data_pool_name()
 
@@ -106,6 +114,8 @@ class TestMisc(CephFSTestCase):
         only session
         """
 
+        session_autoclose = self.fs.get_var("session_autoclose")
+
         self.mount_b.umount_wait()
         ls_data = self.fs.mds_asok(['session', 'ls'])
         self.assert_session_count(1, ls_data)
@@ -113,7 +123,7 @@ class TestMisc(CephFSTestCase):
         self.mount_a.kill()
         self.mount_a.kill_cleanup()
 
-        time.sleep(self.mds_session_autoclose * 1.5)
+        time.sleep(session_autoclose * 1.5)
         ls_data = self.fs.mds_asok(['session', 'ls'])
         self.assert_session_count(1, ls_data)
 
@@ -128,7 +138,7 @@ class TestMisc(CephFSTestCase):
         self.mount_a.kill()
         self.mount_a.kill_cleanup()
 
-        time.sleep(self.mds_session_autoclose * 1.5)
+        time.sleep(session_autoclose * 1.5)
         ls_data = self.fs.mds_asok(['session', 'ls'])
         self.assert_session_count(1, ls_data)
 
@@ -202,3 +212,75 @@ class TestMisc(CephFSTestCase):
 
         ratio = raw_avail / fs_avail
         assert 0.9 < ratio < 1.1
+
+    def _run_drop_cache_cmd(self, timeout, use_tell):
+        drop_res = None
+        if use_tell:
+            mds_id = self.fs.get_lone_mds_id()
+            drop_res = json.loads(
+                self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id),
+                                                    "cache", "drop", str(timeout)))
+        else:
+            drop_res = self.fs.mds_asok(["cache", "drop", str(timeout)])
+        return drop_res
+
+    def _drop_cache_command(self, timeout, use_tell=True):
+        self.mount_b.umount_wait()
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        self.assert_session_count(1, ls_data)
+
+        # create some files
+        self.mount_a.create_n_files("dc-dir/dc-file", 1000)
+        # drop cache
+        drop_res = self._run_drop_cache_cmd(timeout, use_tell)
+
+        self.assertTrue(drop_res['client_recall']['return_code'] == 0)
+        self.assertTrue(drop_res['flush_journal']['return_code'] == 0)
+
+    def _drop_cache_command_timeout(self, timeout, use_tell=True):
+        self.mount_b.umount_wait()
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        self.assert_session_count(1, ls_data)
+
+        # create some files
+        self.mount_a.create_n_files("dc-dir/dc-file-t", 1000)
+
+        # simulate client death and try drop cache
+        self.mount_a.kill()
+        drop_res = self._run_drop_cache_cmd(timeout, use_tell)
+
+        self.assertTrue(drop_res['client_recall']['return_code'] == -errno.ETIMEDOUT)
+        self.assertTrue(drop_res['flush_journal']['return_code'] == 0)
+
+        self.mount_a.kill_cleanup()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_drop_cache_command_asok(self):
+        """
+        Basic test for checking drop cache command using admin socket.
+        Note that the cache size post trimming is not checked here.
+        """
+        self._drop_cache_command(10, use_tell=False)
+
+    def test_drop_cache_command_tell(self):
+        """
+        Basic test for checking drop cache command using tell interface.
+        Note that the cache size post trimming is not checked here.
+        """
+        self._drop_cache_command(10)
+
+    def test_drop_cache_command_timeout_asok(self):
+        """
+        Check drop cache command with non-responding client using admin
+        socket. Note that the cache size post trimming is not checked here.
+        """
+        self._drop_cache_command_timeout(5, use_tell=False)
+
+    def test_drop_cache_command_timeout_tell(self):
+        """
+        Check drop cache command with non-responding client using tell
+        interface. Note that the cache size post trimming is not checked
+        here.
+        """
+        self._drop_cache_command_timeout(5)
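
The new drop-cache tests above drive the same MDS command through two interfaces. A short sketch of both invocations, using the identifiers from the hunk (`fs` is the Filesystem fixture and `json` is already imported by the module; the result layout matches what the assertions check):

    # Via the monitor "tell" interface:
    mds_id = fs.get_lone_mds_id()
    res = json.loads(fs.mon_manager.raw_cluster_cmd(
        "tell", "mds.{0}".format(mds_id), "cache", "drop", "10"))

    # Via the MDS admin socket:
    res = fs.mds_asok(["cache", "drop", "10"])

    # Either way, a healthy run reports success for both phases:
    assert res['client_recall']['return_code'] == 0
    assert res['flush_journal']['return_code'] == 0
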
index 097342a9d48bfca6a4f93a699c09591e5aba2062..97049b9c0a3374af448c708d1d34370380f3a8ab 100644 (file)
@@ -141,10 +141,6 @@ class TestRecoveryPool(CephFSTestCase):
         self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                 '--yes-i-really-mean-it')
 
-        def get_state(mds_id):
-            info = self.mds_cluster.get_mds_info(mds_id)
-            return info['state'] if info is not None else None
-
         self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
         self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
         self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
@@ -153,7 +149,7 @@ class TestRecoveryPool(CephFSTestCase):
         if False:
             with self.assertRaises(CommandFailedError):
                 # Normal reset should fail when no objects are present, we'll use --force instead
-                self.fs.journal_tool(["journal", "reset"])
+                self.fs.journal_tool(["journal", "reset"], 0)
 
         self.fs.mds_stop()
         self.fs.data_scan(['scan_extents', '--alternate-pool',
@@ -163,22 +159,18 @@ class TestRecoveryPool(CephFSTestCase):
                            recovery_pool, '--filesystem', self.fs.name,
                            '--force-corrupt', '--force-init',
                            self.fs.get_data_pool_name()])
-        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
-                              'recover_dentries', 'list',
-                              '--alternate-pool', recovery_pool])
+        self.fs.journal_tool(['event', 'recover_dentries', 'list',
+                              '--alternate-pool', recovery_pool], 0)
 
         self.fs.data_scan(['init', '--force-init', '--filesystem',
                            self.fs.name])
         self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
                            '--force-corrupt', '--force-init',
                            self.fs.get_data_pool_name()])
-        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
-                              'recover_dentries', 'list'])
+        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
 
-        self.fs.journal_tool(['--rank=' + recovery_fs + ":0", 'journal',
-                              'reset', '--force'])
-        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal',
-                              'reset', '--force'])
+        self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0)
+        self.fs.journal_tool(['journal', 'reset', '--force'], 0)
         self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
                                             recovery_fs + ":0")
 
@@ -190,12 +182,11 @@ class TestRecoveryPool(CephFSTestCase):
         self.recovery_fs.mds_restart()
         self.fs.wait_for_daemons()
         self.recovery_fs.wait_for_daemons()
-        for mds_id in self.recovery_fs.mds_ids:
-            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id,
+        status = self.recovery_fs.status()
+        for rank in self.recovery_fs.get_ranks(status=status):
+            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'],
                                                 'injectargs', '--debug-mds=20')
-            self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id,
-                                                'scrub_path', '/',
-                                                'recursive', 'repair')
+            self.fs.rank_asok(['scrub_path', '/', 'recursive', 'repair'], rank=rank['rank'], status=status)
         log.info(str(self.mds_cluster.status()))
 
         # Mount a client
index f597c08d65b0bb68e69e4b221346d3b921e84bd5..b2bca00d8d1b2cd2a7e72cd2c9c3386983a26ffd 100644 (file)
@@ -115,7 +115,7 @@ def generate_iso(ctx, config):
 
         (remote,) = ctx.cluster.only(client).remotes.keys()
 
-        clone_dir = '{tdir}/clone.{role}'.format(tdir=testdir, role=client)
+        clone_dir = '{tdir}/qemu_clone.{role}'.format(tdir=testdir, role=client)
         remote.run(args=refspec.clone(git_url, clone_dir))
 
         src_dir = os.path.dirname(__file__)
@@ -212,7 +212,7 @@ def generate_iso(ctx, config):
                     os.path.join(testdir, 'qemu', 'userdata.' + client),
                     os.path.join(testdir, 'qemu', 'metadata.' + client),
                     '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client),
-                    '{tdir}/clone.{client}'.format(tdir=testdir, client=client),
+                    '{tdir}/qemu_clone.{client}'.format(tdir=testdir, client=client),
                     ],
                 )
 
index 111e2d8c48c50c7456a76f4ffd2a0ec099cd6056..9a4d35abf6ba9f22e6b533b725f3d2b37219502e 100644 (file)
@@ -12,4 +12,4 @@ overrides:
       - \(REQUEST_SLOW\)
       - \(TOO_FEW_PGS\)
       - \(MON_DOWN\)
-      - slow requests
+      - slow request
index 0a46ade76ab3e503b61330973f44a2152fc8bc66..17cfaf7f124d31effe1f6a07f029ef8603271c5b 100644 (file)
@@ -410,7 +410,7 @@ def _run_tests(ctx, refspec, role, tests, env, basedir,
                 )
                 if cleanup:
                     args=['sudo', 'rm', '-rf', '--', scratch_tmp]
-                    remote.run(logger=log.getChild(role), args=args, timeout=(15*60))
+                    remote.run(logger=log.getChild(role), args=args, timeout=(60*60))
     finally:
         log.info('Stopping %s on %s...', tests, role)
         args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir]
index 30e74cce5b56da7c2ce6e355501521edfc558140..4a9f0a66f32e5db419aaac18a11ebcaf6225e44f 100755 (executable)
@@ -1,12 +1,9 @@
-#!/bin/sh -e
+#!/bin/sh -ex
 
-#check ceph health
 ceph -s
-#list pools
 rados lspools
-#lisr rbd images
 rbd ls
-#check that the monitors work
+# check that the monitors work
 ceph osd set nodown
 ceph osd unset nodown
 
diff --git a/ceph/qa/workunits/mon/test_config_key_caps.sh b/ceph/qa/workunits/mon/test_config_key_caps.sh
new file mode 100755 (executable)
index 0000000..77b4b53
--- /dev/null
@@ -0,0 +1,201 @@
+#!/usr/bin/env bash
+
+set -x
+set -e
+
+tmp=$(mktemp -d -p /tmp test_mon_config_key_caps.XXXXX)
+entities=()
+
+function cleanup()
+{
+       set +e
+       set +x
+       if [[ -e $tmp/keyring ]] && [[ -e $tmp/keyring.orig ]]; then
+               grep '\[.*\..*\]' $tmp/keyring.orig > $tmp/entities.orig
+               for e in $(grep '\[.*\..*\]' $tmp/keyring | \
+                       diff $tmp/entities.orig - | \
+                       sed -n 's/^.*\[\(.*\..*\)\]/\1/p');
+               do
+                       ceph auth rm $e 2>&1 >& /dev/null
+               done
+       fi
+       #rm -fr $tmp
+}
+
+trap cleanup 0 # cleanup on exit
+
+function expect_false()
+{
+       set -x
+       if "$@"; then return 1; else return 0; fi
+}
+
+# for cleanup purposes
+ceph auth export -o $tmp/keyring.orig
+
+k=$tmp/keyring
+
+# setup a few keys
+ceph config-key ls
+ceph config-key set daemon-private/osd.123/test-foo
+ceph config-key set mgr/test-foo
+ceph config-key set device/test-foo
+ceph config-key set test/foo
+
+allow_aa=client.allow_aa
+allow_bb=client.allow_bb
+allow_cc=client.allow_cc
+
+mgr_a=mgr.a
+mgr_b=mgr.b
+osd_a=osd.100
+osd_b=osd.200
+
+prefix_aa=client.prefix_aa
+prefix_bb=client.prefix_bb
+prefix_cc=client.prefix_cc
+match_aa=client.match_aa
+match_bb=client.match_bb
+
+fail_aa=client.fail_aa
+fail_bb=client.fail_bb
+fail_cc=client.fail_cc
+fail_dd=client.fail_dd
+fail_ee=client.fail_ee
+fail_ff=client.fail_ff
+fail_gg=client.fail_gg
+fail_writes=client.fail_writes
+
+ceph auth get-or-create $allow_aa mon 'allow *'
+ceph auth get-or-create $allow_bb mon 'allow service config-key rwx'
+ceph auth get-or-create $allow_cc mon 'allow command "config-key get"'
+
+ceph auth get-or-create $mgr_a mon 'allow profile mgr'
+ceph auth get-or-create $mgr_b mon 'allow profile mgr'
+ceph auth get-or-create $osd_a mon 'allow profile osd'
+ceph auth get-or-create $osd_b mon 'allow profile osd'
+
+ceph auth get-or-create $prefix_aa mon \
+       "allow command \"config-key get\" with key prefix client/$prefix_aa"
+
+cap="allow command \"config-key set\" with key prefix client/"
+cap="$cap,allow command \"config-key get\" with key prefix client/$prefix_bb"
+ceph auth get-or-create $prefix_bb mon "$cap"
+
+cap="allow command \"config-key get\" with key prefix client/"
+cap="$cap, allow command \"config-key set\" with key prefix client/"
+cap="$cap, allow command \"config-key ls\""
+ceph auth get-or-create $prefix_cc mon "$cap"
+
+cap="allow command \"config-key get\" with key=client/$match_aa/foo"
+ceph auth get-or-create $match_aa mon "$cap"
+cap="allow command \"config-key get\" with key=client/$match_bb/foo"
+cap="$cap,allow command \"config-key set\" with key=client/$match_bb/foo"
+ceph auth get-or-create $match_bb mon "$cap"
+
+ceph auth get-or-create $fail_aa mon 'allow rx'
+ceph auth get-or-create $fail_bb mon 'allow r,allow w'
+ceph auth get-or-create $fail_cc mon 'allow rw'
+ceph auth get-or-create $fail_dd mon 'allow rwx'
+ceph auth get-or-create $fail_ee mon 'allow profile bootstrap-rgw'
+ceph auth get-or-create $fail_ff mon 'allow profile bootstrap-rbd'
+# write commands will require rw; wx is not enough
+ceph auth get-or-create $fail_gg mon 'allow service config-key wx'
+# read commands will only require 'r'; 'rx' should be enough.
+ceph auth get-or-create $fail_writes mon 'allow service config-key rx'
+
+# grab keyring
+ceph auth export -o $k
+
+# keys with all the caps can do whatever
+for c in $allow_aa $allow_bb $allow_cc $mgr_a $mgr_b; do
+       ceph -k $k --name $c config-key get daemon-private/osd.123/test-foo
+       ceph -k $k --name $c config-key get mgr/test-foo
+       ceph -k $k --name $c config-key get device/test-foo
+       ceph -k $k --name $c config-key get test/foo
+done
+
+for c in $osd_a $osd_b; do
+       ceph -k $k --name $c config-key put daemon-private/$c/test-foo
+       ceph -k $k --name $c config-key get daemon-private/$c/test-foo
+       expect_false ceph -k $k --name $c config-key ls
+       expect_false ceph -k $k --name $c config-key get mgr/test-foo
+       expect_false ceph -k $k --name $c config-key get device/test-foo
+       expect_false ceph -k $k --name $c config-key get test/foo
+done
+
+expect_false ceph -k $k --name $osd_a get daemon-private/$osd_b/test-foo
+expect_false ceph -k $k --name $osd_b get daemon-private/$osd_a/test-foo
+
+expect_false ceph -k $k --name $prefix_aa \
+       config-key ls
+expect_false ceph -k $k --name $prefix_aa \
+       config-key get daemon-private/osd.123/test-foo
+expect_false ceph -k $k --name $prefix_aa \
+       config-key set test/bar
+expect_false ceph -k $k --name $prefix_aa \
+       config-key set client/$prefix_aa/foo
+
+# write something so we can read, use a custom entity
+ceph -k $k --name $allow_bb config-key set client/$prefix_aa/foo
+ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/foo
+# check one writes to the other's prefix, the other is able to read
+ceph -k $k --name $prefix_bb config-key set client/$prefix_aa/bar
+ceph -k $k --name $prefix_aa config-key get client/$prefix_aa/bar
+
+ceph -k $k --name $prefix_bb config-key set client/$prefix_bb/foo
+ceph -k $k --name $prefix_bb config-key get client/$prefix_bb/foo
+
+expect_false ceph -k $k --name $prefix_bb config-key get client/$prefix_aa/bar
+expect_false ceph -k $k --name $prefix_bb config-key ls
+expect_false ceph -k $k --name $prefix_bb \
+       config-key get daemon-private/osd.123/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get mgr/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get device/test-foo
+expect_false ceph -k $k --name $prefix_bb config-key get test/bar
+expect_false ceph -k $k --name $prefix_bb config-key set test/bar
+
+ceph -k $k --name $prefix_cc config-key set client/$match_aa/foo
+ceph -k $k --name $prefix_cc config-key set client/$match_bb/foo
+ceph -k $k --name $prefix_cc config-key get client/$match_aa/foo
+ceph -k $k --name $prefix_cc config-key get client/$match_bb/foo
+expect_false ceph -k $k --name $prefix_cc config-key set other/prefix
+expect_false ceph -k $k --name $prefix_cc config-key get mgr/test-foo
+ceph -k $k --name $prefix_cc config-key ls >& /dev/null
+
+ceph -k $k --name $match_aa config-key get client/$match_aa/foo
+expect_false ceph -k $k --name $match_aa config-key get client/$match_bb/foo
+expect_false ceph -k $k --name $match_aa config-key set client/$match_aa/foo
+ceph -k $k --name $match_bb config-key get client/$match_bb/foo
+ceph -k $k --name $match_bb config-key set client/$match_bb/foo
+expect_false ceph -k $k --name $match_bb config-key get client/$match_aa/foo
+expect_false ceph -k $k --name $match_bb config-key set client/$match_aa/foo
+
+keys=(daemon-private/osd.123/test-foo
+         mgr/test-foo
+         device/test-foo
+         test/foo
+         client/$prefix_aa/foo
+         client/$prefix_bb/foo
+         client/$match_aa/foo
+         client/$match_bb/foo
+)
+# expect these all to fail accessing config-key
+for c in $fail_aa $fail_bb $fail_cc \
+                $fail_dd $fail_ee $fail_ff \
+                $fail_gg; do
+       for m in get set; do
+               for key in ${keys[*]} client/$prefix_aa/foo client/$prefix_bb/foo; do
+                       expect_false ceph -k $k --name $c config-key $m $key
+               done
+       done
+done
+
+# fail writes but succeed on reads
+expect_false ceph -k $k --name $fail_writes config-key set client/$match_aa/foo
+expect_false ceph -k $k --name $fail_writes config-key set test/foo
+ceph -k $k --name $fail_writes config-key ls
+ceph -k $k --name $fail_writes config-key get client/$match_aa/foo 
+ceph -k $k --name $fail_writes config-key get daemon-private/osd.123/test-foo
+
+echo "OK"
index 43ded25b2036e86ae1c30bfd3779384d671c5de4..3aaaec7eb2302209c8549b82558a913dd2697353 100755 (executable)
@@ -20,8 +20,8 @@ hello_world_cpp
 "
 BINARIES="${BINARIES_TO_RUN}hello_radosstriper_cpp
 "
-DL_PREFIX="http://git.ceph.com/?p=ceph.git;a=blob_plain;f=examples/librados/"
-#DL_PREFIX="https://raw.githubusercontent.com/ceph/ceph/master/examples/librados/"
+DL_PREFIX="http://git.ceph.com/?p=ceph.git;a=blob_plain;hb=luminous;f=examples/librados/"
+#DL_PREFIX="https://raw.githubusercontent.com/ceph/ceph/luminous/examples/librados/"
 DESTDIR=$(pwd)
 
 function cleanup () {
index 7ee21f09f02dd18be457572306b08a052c7ced51..65a45d8b715d3846cfd9c7d5eb22217d6a07cf1b 100755 (executable)
@@ -1,7 +1,7 @@
 #!/bin/bash -ex
 
-STACK_BRANCH=stable/pike
-TEMPEST_BRANCH=17.2.0
+STACK_BRANCH=stable/rocky
+TEMPEST_BRANCH=19.0.0
 
 STACK_USER=${STACK_USER:-stack}
 STACK_GROUP=${STACK_GROUP:-stack}
index 60e91496511f9cbb446e73184f36e9d2cc7f92a9..7e4ad3bd3f5d0ddcd2aa80ed2727f82befaa3e64 100755 (executable)
@@ -3,7 +3,7 @@
 set -e
 set -x
 
-export BIN="${BIN:-cephfs-journal-tool}"
+export BIN="${BIN:-cephfs-journal-tool --rank=cephfs:0}"
 export JOURNAL_FILE=/tmp/journal.bin
 export JSON_OUTPUT=/tmp/json.tmp
 export BINARY_OUTPUT=/tmp/binary.tmp
index 078345422763426fd479e85cde219d242de3242f..2244e5ea5b0aac5b8c0c0e253e5e83182d0b5297 100755 (executable)
 #
 
 #
-# Return MAX(1, (number of processors / 2)) by default or NPROC
+# To just look at what this script will do, run it like this:
 #
+# $ DRY_RUN=echo ./run-make-check.sh
+#
+
+set -e
+
+trap clean_up_after_myself EXIT
+
+ORIGINAL_CCACHE_CONF="$HOME/.ccache/ccache.conf"
+SAVED_CCACHE_CONF="$HOME/.run-make-check-saved-ccache-conf"
+
+function save_ccache_conf() {
+    test -f $ORIGINAL_CCACHE_CONF && cp $ORIGINAL_CCACHE_CONF $SAVED_CCACHE_CONF || true
+}
+
+function restore_ccache_conf() {
+    test -f $SAVED_CCACHE_CONF && mv $SAVED_CCACHE_CONF $ORIGINAL_CCACHE_CONF || true
+}
+
+function clean_up_after_myself() {
+    rm -fr ${CEPH_BUILD_VIRTUALENV:-/tmp}/*virtualenv*
+    restore_ccache_conf
+}
+
 function get_processors() {
     if test -n "$NPROC" ; then
         echo $NPROC
@@ -54,26 +77,72 @@ function run() {
         exit 1
     fi
     if [ -n "$install_cmd" ]; then
-        $DRY_RUN sudo $install_cmd ccache jq $which_pkg
+        $DRY_RUN sudo $install_cmd ccache $which_pkg
     else
         echo "WARNING: Don't know how to install packages" >&2
         echo "This probably means distribution $ID is not supported by run-make-check.sh" >&2
     fi
 
+    if ! type ccache > /dev/null 2>&1 ; then
+        echo "ERROR: ccache could not be installed"
+        exit 1
+    fi
+
     if test -f ./install-deps.sh ; then
        $DRY_RUN ./install-deps.sh || return 1
+        trap clean_up_after_myself EXIT
     fi
 
     # Init defaults after deps are installed. get_processors() depends on coreutils nproc.
     DEFAULT_MAKEOPTS=${DEFAULT_MAKEOPTS:--j$(get_processors)}
     BUILD_MAKEOPTS=${BUILD_MAKEOPTS:-$DEFAULT_MAKEOPTS}
+    test "$BUILD_MAKEOPTS" && echo "make will run with option(s) $BUILD_MAKEOPTS"
     CHECK_MAKEOPTS=${CHECK_MAKEOPTS:-$DEFAULT_MAKEOPTS}
 
-    $DRY_RUN ./do_cmake.sh $@ || return 1
+    if type python2 > /dev/null 2>&1 ; then
+        # gtest-parallel requires Python 2
+        CMAKE_PYTHON_OPTS="-DWITH_GTEST_PARALLEL=ON"
+    else
+        CMAKE_PYTHON_OPTS="-DWITH_PYTHON2=OFF -DWITH_PYTHON3=ON -DMGR_PYTHON_VERSION=3 -DWITH_GTEST_PARALLEL=OFF"
+    fi
+
+    CMAKE_BUILD_OPTS=""
+
+    cat <<EOM
+Note that the binaries produced by this script do not contain correct time
+and git version information, which may make them unsuitable for debugging
+and production use.
+EOM
+    save_ccache_conf
+    # remove the entropy generated by the date/time embedded in the build
+    CMAKE_BUILD_OPTS="$CMAKE_BUILD_OPTS -DENABLE_GIT_VERSION=OFF"
+    $DRY_RUN export SOURCE_DATE_EPOCH="946684800"
+    $DRY_RUN ccache -o sloppiness=time_macros
+    $DRY_RUN ccache -o run_second_cpp=true
+    if [ -n "$JENKINS_HOME" ]; then
+        # Build host has plenty of space available, let's use it to keep
+        # various versions of the built objects. This could increase the cache hit
+        # if the same or similar PRs are running several times
+        $DRY_RUN ccache -o max_size=100G
+    else
+        echo "Current ccache max_size setting:"
+        ccache -p | grep max_size
+    fi
+    $DRY_RUN ccache -z # Reset the ccache statistics
+
+    $DRY_RUN ./do_cmake.sh $CMAKE_BUILD_OPTS $CMAKE_PYTHON_OPTS $@ || return 1
     $DRY_RUN cd build
     $DRY_RUN make $BUILD_MAKEOPTS tests || return 1
-    # prevent OSD EMFILE death on tests
-    $DRY_RUN sudo ulimit -n 32768
+
+    $DRY_RUN ccache -s # print the ccache statistics to evaluate the efficiency
+
+    # to prevent OSD EMFILE death on tests, make sure ulimit >= 1024
+    $DRY_RUN ulimit -n $(ulimit -Hn)
+    if [ $(ulimit -n) -lt 1024 ];then
+        echo "***ulimit -n too small, better bigger than 1024 for test***"
+        return 1
+    fi
+
     if ! $DRY_RUN ctest $CHECK_MAKEOPTS --output-on-failure; then
         rm -f ${TMPDIR:-/tmp}/ceph-asok.*
         return 1
@@ -86,21 +155,14 @@ function main() {
         echo "with the ability to run commands as root via sudo."
     fi
     echo -n "Checking hostname sanity... "
-    if hostname --fqdn >/dev/null 2>&1 ; then
+    if $DRY_RUN hostname --fqdn >/dev/null 2>&1 ; then
         echo "OK"
     else
         echo "NOT OK"
         echo "Please fix 'hostname --fqdn', otherwise 'make check' will fail"
         return 1
     fi
-    if run "$@" ; then
-        rm -fr ${CEPH_BUILD_VIRTUALENV:-/tmp}/*virtualenv*
-        echo "cmake check: successful run on $(git rev-parse HEAD)"
-        return 0
-    else
-        rm -fr ${CEPH_BUILD_VIRTUALENV:-/tmp}/*virtualenv*
-        return 1
-    fi
+    run "$@" && echo "make check: successful run on $(git rev-parse HEAD)"
 }
 
 main "$@"
index fc407817f0ed34a6716f4749e8d6acea98e6df14..268e02b634ff466afc8a44503feb7da8173f4886 100644 (file)
@@ -1,2 +1,2 @@
-177915764b752804194937482a39e95e0ca3de94
-v12.2.10
+26dc3775efc7bb286a1d6d66faee0ba30ea23eee
+v12.2.11
index ab46b60c57999309532569d480b17d93f01f8c48..286e383f63f578cf4d271216df447941052cbc05 100644 (file)
@@ -30,6 +30,10 @@ AuthSessionHandler *get_auth_session_handler(CephContext *cct, int protocol, Cry
  
   switch (protocol) {
   case CEPH_AUTH_CEPHX:
+    // if there is no session key, there is no session handler.
+    if (key.get_type() == CEPH_CRYPTO_NONE) {
+      return nullptr;
+    }
     return new CephxSessionHandler(cct, key, features);
   case CEPH_AUTH_NONE:
     return new AuthNoneSessionHandler(cct, key);
index c14c02f28dc0dcadfef53d30e8bb6d832586ded1..41d76e1578dfdeca1baefb4c1571165dad9b7b70 100755 (executable)
@@ -91,12 +91,12 @@ def get_key(cluster, mon_id, wait_count=600):
     pathdir = os.path.dirname(path)
     if not os.path.exists(pathdir):
         os.makedirs(pathdir)
-        os.chmod(pathdir, 0770)
+        os.chmod(pathdir, 0o770)
         os.chown(pathdir, get_ceph_uid(), get_ceph_gid())
     while wait_count > 0:
         try:
-            with file(tmp, 'w') as f:
-                os.fchmod(f.fileno(), 0600)
+            with open(tmp, 'w') as f:
+                os.fchmod(f.fileno(), 0o600)
                 os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid())
                 LOG.info('Talking to monitor...')
 
@@ -201,13 +201,13 @@ def bootstrap_key(cluster, type_, wait_count=600):
     pathdir = os.path.dirname(path)
     if not os.path.exists(pathdir):
         os.makedirs(pathdir)
-        os.chmod(pathdir, 0770)
+        os.chmod(pathdir, 0o770)
         os.chown(pathdir, get_ceph_uid(), get_ceph_gid())
 
     while wait_count > 0:
         try:
-            with file(tmp, 'w') as f:
-                os.fchmod(f.fileno(), 0600)
+            with open(tmp, 'w') as f:
+                os.fchmod(f.fileno(), 0o600)
                 os.fchown(f.fileno(), get_ceph_uid(), get_ceph_gid())
                 LOG.info('Talking to monitor...')
                 returncode = subprocess.call(
index aed4a8f6417c4fc31337f3868efd190cf9104041..bcb54d65bf472b4e679bd9db02e13708a00ab6a0 100644 (file)
@@ -466,6 +466,9 @@ def remove_vg(vg_name):
     """
     Removes a volume group.
     """
+    if not vg_name:
+        logger.warning('Skipping removal of invalid VG name: "%s"', vg_name)
+        return
     fail_msg = "Unable to remove vg %s" % vg_name
     process.run(
         [
index 852c314c2993e3d44d370a590ebe5a52c73dcdf4..1ad15bc80a81c2386129da98e9fcb848e5ab8cc0 100644 (file)
@@ -63,6 +63,9 @@ def activate_filestore(lvs, no_systemd=False):
     if not system.device_is_mounted(source, destination=destination):
         prepare_utils.mount_osd(source, osd_id, is_vdo=is_vdo)
 
+    # ensure that the OSD destination is always chowned properly
+    system.chown(destination)
+
     # always re-do the symlink regardless if it exists, so that the journal
     # device path that may have changed can be mapped correctly every time
     destination = '/var/lib/ceph/osd/%s-%s/journal' % (conf.cluster, osd_id)
@@ -151,7 +154,10 @@ def activate_bluestore(lvs, no_systemd=False):
     db_device_path = get_osd_device_path(osd_lv, lvs, 'db', dmcrypt_secret=dmcrypt_secret)
     wal_device_path = get_osd_device_path(osd_lv, lvs, 'wal', dmcrypt_secret=dmcrypt_secret)
 
-    # Once symlinks are removed, the osd dir can be 'primed again.
+    # Once symlinks are removed, the osd dir can be 'primed again. chown first,
+    # regardless of what currently exists so that ``prime-osd-dir`` can succeed
+    # even if permissions are somehow messed up
+    system.chown(osd_path)
     prime_command = [
         'ceph-bluestore-tool', '--cluster=%s' % conf.cluster,
         'prime-osd-dir', '--dev', osd_lv_path,
index cce58b166d05b58eaf4104a627de88dce24acd84..76a52f37dea664eadf60e63df5cf2f3e3d8c427c 100644 (file)
@@ -139,14 +139,11 @@ class Batch(object):
         self.argv = argv
 
     def get_devices(self):
-        all_devices = disk.get_devices()
         # remove devices with partitions
-        # XXX Should be optional when getting device info
-        for device, detail in all_devices.items():
-            if detail.get('partitions') != {}:
-                del all_devices[device]
-        devices = sorted(all_devices.items(), key=lambda x: (x[0], x[1]['size']))
-        return device_formatter(devices)
+        devices = [(device, details) for device, details in
+                       disk.get_devices().items() if details.get('partitions') == {}]
+        size_sort = lambda x: (x[0], x[1]['size'])
+        return device_formatter(sorted(devices, key=size_sort))
 
     def print_help(self):
         return self._help.format(
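
The rewritten get_devices() above is a filter-then-sort over the dictionary returned by disk.get_devices(): devices that already carry partitions are dropped, and the rest are ordered by (path, size) before being handed to device_formatter(). A minimal standalone sketch of that logic, with an invented devices dict shaped like the ones used by the tests elsewhere in this series:

    # invented sample shaped like disk.get_devices(): path -> details dict
    devices = {
        '/dev/sda': {'partitions': {'sda1': {}}, 'size': 240057409536},
        '/dev/sdc': {'partitions': {}, 'size': 250059350016},
        '/dev/sdb': {'partitions': {}, 'size': 500107862016},
    }

    # keep only devices without partitions, mirroring the batch filter
    unpartitioned = [(dev, details) for dev, details in devices.items()
                     if details.get('partitions') == {}]

    # same sort key as above: (device path, reported size)
    for dev, details in sorted(unpartitioned, key=lambda x: (x[0], x[1]['size'])):
        print(dev, details['size'])
    # -> /dev/sdb and /dev/sdc are printed; /dev/sda is skipped because it has a partition
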
index 92dc3a2e90a74dd8f62bb4d0ac60bc1a73771fa0..ee269a39463f799c79885d1b9d0aaf11ea42baa7 100644 (file)
@@ -1,47 +1,28 @@
 from __future__ import print_function
-import json
 from ceph_volume.util import disk, prepare
 from ceph_volume.api import lvm
 from . import validators
+from .strategies import Strategy
+from .strategies import MixedStrategy
 from ceph_volume.devices.lvm.create import Create
 from ceph_volume.devices.lvm.prepare import Prepare
 from ceph_volume.util import templates
 from ceph_volume.exceptions import SizeAllocationError
 
 
-class SingleType(object):
+class SingleType(Strategy):
     """
     Support for all SSDs, or all HDDS
     """
 
     def __init__(self, devices, args):
-        self.args = args
-        self.osds_per_device = args.osds_per_device
-        self.devices = devices
-        # TODO: add --fast-devices and --slow-devices so these can be customized
-        self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
-        self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
-        self.computed = {'osds': [], 'vgs': [], 'filtered_devices': args.filtered_devices}
-        if self.devices:
-            self.validate()
-            self.compute()
-        else:
-            self.computed["changed"] = False
+        super(SingleType, self).__init__(devices, args)
+        self.validate_compute()
 
     @staticmethod
     def type():
         return "bluestore.SingleType"
 
-    @property
-    def total_osds(self):
-        if self.hdds:
-            return len(self.hdds) * self.osds_per_device
-        else:
-            return len(self.ssds) * self.osds_per_device
-
-    def report_json(self):
-        print(json.dumps(self.computed, indent=4, sort_keys=True))
-
     def report_pretty(self):
         string = ""
         if self.args.filtered_devices:
@@ -141,32 +122,19 @@ class SingleType(object):
                     Create(command).main()
 
 
-class MixedType(object):
+class MixedType(MixedStrategy):
 
     def __init__(self, devices, args):
-        self.args = args
-        self.devices = devices
-        self.osds_per_device = args.osds_per_device
-        # TODO: add --fast-devices and --slow-devices so these can be customized
-        self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
-        self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
-        self.computed = {'osds': [], 'filtered_devices': args.filtered_devices}
+        super(MixedType, self).__init__(devices, args)
         self.block_db_size = self.get_block_size()
         self.system_vgs = lvm.VolumeGroups()
         self.dbs_needed = len(self.hdds) * self.osds_per_device
-        if self.devices:
-            self.validate()
-            self.compute()
-        else:
-            self.computed["changed"] = False
+        self.validate_compute()
 
     @staticmethod
     def type():
         return "bluestore.MixedType"
 
-    def report_json(self):
-        print(json.dumps(self.computed, indent=4, sort_keys=True))
-
     def get_block_size(self):
         if self.args.block_db_size:
             return disk.Size(b=self.args.block_db_size)
@@ -319,17 +287,6 @@ class MixedType(object):
             else:
                 Create(command).main()
 
-    def get_common_vg(self):
-        # find all the vgs associated with the current device
-        for ssd in self.ssds:
-            for pv in ssd.pvs_api:
-                vg = self.system_vgs.get(vg_name=pv.vg_name)
-                if not vg:
-                    continue
-                # this should give us just one VG, it would've been caught by
-                # the validator otherwise
-                return vg
-
     def validate(self):
         """
         HDDs represent data devices, and solid state devices are for block.db,
index b94cc6ea38d545fa9e488c7a84d74f829e2d60ae..c01e83721383158f6862cc8614e0bcf0ec777b9c 100644 (file)
@@ -1,8 +1,9 @@
 from __future__ import print_function
-import json
 from ceph_volume.util import disk, prepare
 from ceph_volume.api import lvm
 from . import validators
+from .strategies import Strategy
+from .strategies import MixedStrategy
 from ceph_volume.devices.lvm.create import Create
 from ceph_volume.devices.lvm.prepare import Prepare
 from ceph_volume.util import templates
@@ -20,40 +21,21 @@ def get_journal_size(args):
         return prepare.get_journal_size(lv_format=False)
 
 
-class SingleType(object):
+class SingleType(Strategy):
     """
     Support for all SSDs, or all HDDs, data and journal LVs will be colocated
     in the same device
     """
 
     def __init__(self, devices, args):
-        self.args = args
-        self.osds_per_device = args.osds_per_device
-        self.devices = devices
-        self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
-        self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
-        self.computed = {'osds': [], 'vgs': [], 'filtered_devices': args.filtered_devices}
+        super(SingleType, self).__init__(devices, args)
         self.journal_size = get_journal_size(args)
-        if self.devices:
-            self.validate()
-            self.compute()
-        else:
-            self.computed["changed"] = False
+        self.validate_compute()
 
     @staticmethod
     def type():
         return "filestore.SingleType"
 
-    @property
-    def total_osds(self):
-        if self.hdds:
-            return len(self.hdds) * self.osds_per_device
-        else:
-            return len(self.ssds) * self.osds_per_device
-
-    def report_json(self):
-        print(json.dumps(self.computed, indent=4, sort_keys=True))
-
     def report_pretty(self):
         string = ""
         if self.args.filtered_devices:
@@ -176,7 +158,7 @@ class SingleType(object):
                 Create(command).main()
 
 
-class MixedType(object):
+class MixedType(MixedStrategy):
     """
     Supports HDDs with SSDs, journals will be placed on SSDs, while HDDs will
     be used fully for data.
@@ -186,36 +168,17 @@ class MixedType(object):
     """
 
     def __init__(self, devices, args):
-        self.args = args
-        self.osds_per_device = args.osds_per_device
-        self.devices = devices
-        self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
-        self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
-        self.computed = {'osds': [], 'vg': None, 'filtered_devices': args.filtered_devices}
+        super(MixedType, self).__init__(devices, args)
         self.blank_ssds = []
         self.journals_needed = len(self.hdds) * self.osds_per_device
         self.journal_size = get_journal_size(args)
         self.system_vgs = lvm.VolumeGroups()
-        if self.devices:
-            self.validate()
-            self.compute()
-        else:
-            self.computed["changed"] = False
+        self.validate_compute()
 
     @staticmethod
     def type():
         return "filestore.MixedType"
 
-    def report_json(self):
-        print(json.dumps(self.computed, indent=4, sort_keys=True))
-
-    @property
-    def total_osds(self):
-        if self.hdds:
-            return len(self.hdds) * self.osds_per_device
-        else:
-            return len(self.ssds) * self.osds_per_device
-
     def report_pretty(self):
         string = ""
         if self.args.filtered_devices:
@@ -252,17 +215,6 @@ class MixedType(object):
 
         print(string)
 
-    def get_common_vg(self):
-        # find all the vgs associated with the current device
-        for ssd in self.ssds:
-            for pv in ssd.pvs_api:
-                vg = self.system_vgs.get(vg_name=pv.vg_name)
-                if not vg:
-                    continue
-                # this should give us just one VG, it would've been caught by
-                # the validator otherwise
-                return vg
-
     def validate(self):
         """
         Ensure that the minimum requirements for this type of scenario is
diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/strategies.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/strategies/strategies.py
new file mode 100644 (file)
index 0000000..d4ec5a7
--- /dev/null
@@ -0,0 +1,50 @@
+import json
+
+class Strategy(object):
+
+    def __init__(self, devices, args):
+        self.args = args
+        self.osds_per_device = args.osds_per_device
+        self.devices = devices
+        self.hdds = [device for device in devices if device.sys_api['rotational'] == '1']
+        self.ssds = [device for device in devices if device.sys_api['rotational'] == '0']
+        self.computed = {'osds': [], 'vgs': [], 'filtered_devices': args.filtered_devices}
+
+    def validate_compute(self):
+        if self.devices:
+            self.validate()
+            self.compute()
+        else:
+            self.computed["changed"] = False
+
+    def report_json(self):
+        print(json.dumps(self.computed, indent=4, sort_keys=True))
+
+    @property
+    def total_osds(self):
+        if self.hdds:
+            return len(self.hdds) * self.osds_per_device
+        else:
+            return len(self.ssds) * self.osds_per_device
+
+    # protect against base class instantiation and incomplete implementations.
+    # We could also use the abc module and implement this as an
+    # AbstractBaseClass
+    def compute(self):
+        raise NotImplementedError('compute() must be implemented in a child class')
+
+    def execute(self):
+        raise NotImplementedError('execute() must be implemented in a child class')
+
+class MixedStrategy(Strategy):
+
+    def get_common_vg(self):
+        # find all the vgs associated with the current device
+        for ssd in self.ssds:
+            for pv in ssd.pvs_api:
+                vg = self.system_vgs.get(vg_name=pv.vg_name)
+                if not vg:
+                    continue
+                # this should give us just one VG, it would've been caught by
+                # the validator otherwise
+                return vg
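
The new Strategy base class centralizes the boilerplate the bluestore and filestore strategies previously duplicated: splitting devices into hdds/ssds, the computed report skeleton, total_osds, report_json(), and validate_compute(), which only calls validate() and compute() when devices were actually passed in. A rough sketch of a hypothetical subclass, just to show the contract (FakeDevice and FakeArgs are invented stand-ins, not ceph-volume objects; assumes ceph_volume is importable):

    from collections import namedtuple

    from ceph_volume.devices.lvm.strategies.strategies import Strategy

    FakeDevice = namedtuple('FakeDevice', ['sys_api'])
    FakeArgs = namedtuple('FakeArgs', ['osds_per_device', 'filtered_devices'])


    class NoopStrategy(Strategy):
        """Minimal subclass: no validation, records which devices it would use."""

        def validate(self):
            pass

        def compute(self):
            # real strategies fill self.computed['osds'] / ['vgs'] with sizing details
            self.computed['osds'] = [d.sys_api for d in self.devices]

        def execute(self):
            pass


    args = FakeArgs(osds_per_device=1, filtered_devices=[])
    devices = [FakeDevice(sys_api={'rotational': '1', 'path': '/dev/sdb'})]

    strategy = NoopStrategy(devices, args)
    strategy.validate_compute()    # runs validate() + compute() because devices is non-empty
    print(strategy.total_osds)     # -> 1 (one HDD times one OSD per device)
    strategy.report_json()         # JSON dump of strategy.computed
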
index 8e0e3a3c5e974af2b24fcf636268f8a18d620783..328a036152e2c1ebb5242938524247e84d36c160 100644 (file)
@@ -1,11 +1,14 @@
 import argparse
+import os
 import logging
 
 from textwrap import dedent
 
 from ceph_volume import decorators, terminal, process
 from ceph_volume.api import lvm as api
-from ceph_volume.util import system, encryption, disk
+from ceph_volume.util import system, encryption, disk, arg_validators
+from ceph_volume.util.device import Device
+from ceph_volume.systemd import systemctl
 
 logger = logging.getLogger(__name__)
 mlogger = terminal.MultiLogger(__name__)
@@ -39,6 +42,79 @@ def zap_data(path):
     ])
 
 
+def find_associated_devices(osd_id=None, osd_fsid=None):
+    """
+    From an ``osd_id`` and/or an ``osd_fsid``, find all the LVs in the
+    system that match those tag values, further detect if any partitions are
+    part of the OSD, and then return the set of LVs and partitions (if any).
+    """
+    lv_tags = {}
+    if osd_id:
+        lv_tags['ceph.osd_id'] = osd_id
+    if osd_fsid:
+        lv_tags['ceph.osd_fsid'] = osd_fsid
+    lvs = api.Volumes()
+    lvs.filter(lv_tags=lv_tags)
+    if not lvs:
+        raise RuntimeError('Unable to find any LV for zapping OSD: %s' % (osd_id or osd_fsid))
+
+    devices_to_zap = ensure_associated_lvs(lvs)
+
+    return [Device(path) for path in set(devices_to_zap) if path]
+
+
+def ensure_associated_lvs(lvs):
+    """
+    Go through each LV and determine whether its backing devices (journal, wal,
+    block) are LVs or partitions, so that they can be accurately reported.
+    """
+    # look for many LVs for each backing type, because it is possible to
+    # receive a filtering for osd.1, and have multiple failed deployments
+    # leaving many journals with osd.1 - usually, only a single LV will be
+    # returned
+    journal_lvs = lvs._filter(lv_tags={'ceph.type': 'journal'})
+    db_lvs = lvs._filter(lv_tags={'ceph.type': 'db'})
+    wal_lvs = lvs._filter(lv_tags={'ceph.type': 'wal'})
+    backing_devices = [
+        (journal_lvs, 'journal'),
+        (db_lvs, 'block'),
+        (wal_lvs, 'wal')
+    ]
+
+    verified_devices = []
+
+    for lv in lvs:
+        # go through each LV and append it, otherwise query `blkid` to find
+        # a physical device. Do this for each type (journal, db, wal) regardless
+        # of whether it was already processed for a previous LV, so that bad
+        # devices with the same ID can be caught
+        for ceph_lvs, _type in backing_devices:
+            if ceph_lvs:
+                verified_devices.extend([l.lv_path for l in ceph_lvs])
+                continue
+
+            # must be a disk partition, by querying blkid by the uuid we are
+            # ensuring that the device path is always correct
+            try:
+                device_uuid = lv.tags['ceph.%s_uuid' % _type]
+            except KeyError:
+                # Bluestore will not have ceph.journal_uuid, and Filestore
+                # will not have ceph.db_uuid
+                continue
+
+            osd_device = disk.get_device_from_partuuid(device_uuid)
+            if not osd_device:
+                # if the osd_device is not found by the partuuid, then it is
+                # not possible to ensure this device exists anymore, so skip it
+                continue
+            verified_devices.append(osd_device)
+
+        verified_devices.append(lv.lv_path)
+
+    # reduce the list from all the duplicates that were added
+    return list(set(verified_devices))
+
+
 class Zap(object):
 
     help = 'Removes all data and filesystems from a logical volume or partition.'
@@ -59,70 +135,128 @@ class Zap(object):
         if dmcrypt and dmcrypt_uuid:
             self.dmcrypt_close(dmcrypt_uuid)
 
+    def zap_lv(self, device):
+        """
+        Device examples: vg-name/lv-name, /dev/vg-name/lv-name
+        Requirements: Must be a logical volume (LV)
+        """
+        lv = api.get_lv(lv_name=device.lv_name, vg_name=device.vg_name)
+        self.unmount_lv(lv)
+
+        wipefs(device.abspath)
+        zap_data(device.abspath)
+
+        if self.args.destroy:
+            lvs = api.Volumes()
+            lvs.filter(vg_name=device.vg_name)
+            if len(lvs) <= 1:
+                mlogger.info('Only 1 LV left in VG, will proceed to destroy volume group %s', device.vg_name)
+                api.remove_vg(device.vg_name)
+            else:
+                mlogger.info('More than 1 LV left in VG, will proceed to destroy LV only')
+                mlogger.info('Removing LV because --destroy was given: %s', device.abspath)
+                api.remove_lv(device.abspath)
+        elif lv:
+            # just remove all lvm metadata, leaving the LV around
+            lv.clear_tags()
+
+    def zap_partition(self, device):
+        """
+        Device example: /dev/sda1
+        Requirements: Must be a partition
+        """
+        if device.is_encrypted:
+            # find the holder
+            holders = [
+                '/dev/%s' % holder for holder in device.sys_api.get('holders', [])
+            ]
+            for mapper_uuid in os.listdir('/dev/mapper'):
+                mapper_path = os.path.join('/dev/mapper', mapper_uuid)
+                if os.path.realpath(mapper_path) in holders:
+                    self.dmcrypt_close(mapper_uuid)
+
+        if system.device_is_mounted(device.abspath):
+            mlogger.info("Unmounting %s", device.abspath)
+            system.unmount(device.abspath)
+
+        wipefs(device.abspath)
+        zap_data(device.abspath)
+
+        if self.args.destroy:
+            mlogger.info("Destroying partition since --destroy was used: %s" % device.abspath)
+            disk.remove_partition(device)
+
+    def zap_lvm_member(self, device):
+        """
+        An LVM member may have more than one LV and/or VG, for example if it is
+        a raw device with multiple partitions each belonging to a different LV
+
+        Device example: /dev/sda
+        Requirements: An LV or VG present in the device, making it an LVM member
+        """
+        for lv in device.lvs:
+            self.zap_lv(Device(lv.lv_path))
+
+
+    def zap_raw_device(self, device):
+        """
+        Any whole (raw) device passed in as input will be processed here,
+        checking for LVM membership and partitions (if any).
+
+        Device example: /dev/sda
+        Requirements: None
+        """
+        if not self.args.destroy:
+            # the use of dd on a raw device causes the partition table to be
+            # destroyed
+            mlogger.warning(
+                '--destroy was not specified, but zapping a whole device will remove the partition table'
+            )
+
+        # look for partitions and zap those
+        for part_name in device.sys_api.get('partitions', {}).keys():
+            self.zap_partition(Device('/dev/%s' % part_name))
+
+        wipefs(device.abspath)
+        zap_data(device.abspath)
+
     @decorators.needs_root
-    def zap(self, args):
-        for device in args.devices:
-            if disk.is_mapper_device(device):
+    def zap(self, devices=None):
+        devices = devices or self.args.devices
+
+        for device in devices:
+            mlogger.info("Zapping: %s", device.abspath)
+            if device.is_mapper:
                 terminal.error("Refusing to zap the mapper device: {}".format(device))
                 raise SystemExit(1)
-            lv = api.get_lv_from_argument(device)
-            if lv:
-                # we are zapping a logical volume
-                path = lv.lv_path
-                self.unmount_lv(lv)
-            else:
-                # we are zapping a partition
-                #TODO: ensure device is a partition
-                path = device
-                # check to if it is encrypted to close
-                partuuid = disk.get_partuuid(device)
-                if encryption.status("/dev/mapper/{}".format(partuuid)):
-                    dmcrypt_uuid = partuuid
-                    self.dmcrypt_close(dmcrypt_uuid)
-
-            mlogger.info("Zapping: %s", path)
-
-            # check if there was a pv created with the
-            # name of device
-            pvs = api.PVolumes()
-            pvs.filter(pv_name=device)
-            vgs = set([pv.vg_name for pv in pvs])
-            for pv in pvs:
-                vg_name = pv.vg_name
-                lv = None
-                if pv.lv_uuid:
-                    lv = api.get_lv(vg_name=vg_name, lv_uuid=pv.lv_uuid)
-
-                if lv:
-                    self.unmount_lv(lv)
-
-            if args.destroy:
-                for vg_name in vgs:
-                    mlogger.info("Destroying volume group %s because --destroy was given", vg_name)
-                    api.remove_vg(vg_name)
-                if not lv:
-                    mlogger.info("Destroying physical volume %s because --destroy was given", device)
-                    api.remove_pv(device)
-
-            wipefs(path)
-            zap_data(path)
-
-            if lv and not pvs:
-                if args.destroy:
-                    lvs = api.Volumes()
-                    lvs.filter(vg_name=lv.vg_name)
-                    if len(lvs) <= 1:
-                        mlogger.info('Only 1 LV left in VG, will proceed to destroy volume group %s', lv.vg_name)
-                        api.remove_vg(lv.vg_name)
-                    else:
-                        mlogger.info('More than 1 LV left in VG, will proceed to destroy LV only')
-                        mlogger.info('Removing LV because --destroy was given: %s', lv)
-                        api.remove_lv(lv)
-                else:
-                    # just remove all lvm metadata, leaving the LV around
-                    lv.clear_tags()
-
-        terminal.success("Zapping successful for: %s" % ", ".join(args.devices))
+            if device.is_lvm_member:
+                self.zap_lvm_member(device)
+            if device.is_lv:
+                self.zap_lv(device)
+            if device.is_partition:
+                self.zap_partition(device)
+            if device.is_device:
+                self.zap_raw_device(device)
+
+        if self.args.devices:
+            terminal.success(
+                "Zapping successful for: %s" % ", ".join([str(d) for d in self.args.devices])
+            )
+        else:
+            terminal.success(
+                "Zapping successful for OSD: %s" % self.args.osd_id or self.args.osd_fsid
+            )
+
+    @decorators.needs_root
+    def zap_osd(self):
+        if self.args.osd_id:
+            osd_is_running = systemctl.osd_is_active(self.args.osd_id)
+            if osd_is_running:
+                mlogger.error("OSD ID %s is running, stop it with:" % self.args.osd_id)
+                mlogger.error("systemctl stop ceph-osd@%s" % self.args.osd_id)
+                raise SystemExit("Unable to zap devices associated with OSD ID: %s" % self.args.osd_id)
+        devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid)
+        self.zap(devices)
 
     def dmcrypt_close(self, dmcrypt_uuid):
         dmcrypt_path = "/dev/mapper/{}".format(dmcrypt_uuid)
@@ -155,6 +289,14 @@ class Zap(object):
 
              ceph-volume lvm zap /dev/sda /dev/sdb /dev/sdc
 
+          Zapping devices associated with an OSD ID:
+
+              ceph-volume lvm zap --osd-id 1
+
+          Optionally include the OSD FSID:
+
+              ceph-volume lvm zap --osd-id 1 --osd-fsid 55BD4219-16A7-4037-BC20-0F158EFCC83D
+
         If the --destroy flag is given and you are zapping a raw device or partition
         then all vgs and lvs that exist on that raw device or partition will be destroyed.
 
@@ -179,17 +321,35 @@ class Zap(object):
             'devices',
             metavar='DEVICES',
             nargs='*',
+            type=arg_validators.ValidDevice(gpt_ok=True),
             default=[],
             help='Path to one or many lv (as vg/lv), partition (as /dev/sda1) or device (as /dev/sda)'
         )
+
         parser.add_argument(
             '--destroy',
             action='store_true',
             default=False,
             help='Destroy all volume groups and logical volumes if you are zapping a raw device or partition',
         )
+
+        parser.add_argument(
+            '--osd-id',
+            help='Specify an OSD ID to detect associated devices for zapping',
+        )
+
+        parser.add_argument(
+            '--osd-fsid',
+            help='Specify an OSD FSID to detect associated devices for zapping',
+        )
+
         if len(self.argv) == 0:
             print(sub_command_help)
             return
-        args = parser.parse_args(self.argv)
-        self.zap(args)
+
+        self.args = parser.parse_args(self.argv)
+
+        if self.args.osd_id or self.args.osd_fsid:
+            self.zap_osd()
+        else:
+            self.zap()
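
The new zap-by-ID path works in two steps: find_associated_devices() builds an lv_tags filter from whichever of --osd-id / --osd-fsid was given, and ensure_associated_lvs() collects the data LV plus any journal/db/wal devices, deduplicating the result through a set. A small standalone sketch of that pattern (tag names taken from the diff, paths invented):

    def build_lv_tags(osd_id=None, osd_fsid=None):
        # same tag keys the zap code filters LVs on
        lv_tags = {}
        if osd_id:
            lv_tags['ceph.osd_id'] = osd_id
        if osd_fsid:
            lv_tags['ceph.osd_fsid'] = osd_fsid
        return lv_tags


    print(build_lv_tags(osd_id='1'))
    # {'ceph.osd_id': '1'}
    print(build_lv_tags(osd_id='1', osd_fsid='55BD4219-16A7-4037-BC20-0F158EFCC83D'))
    # both keys present, so only LVs tagged with that exact ID *and* FSID match

    # the same path can be appended once per backing type, so duplicates are expected
    # and squashed with a set, just like the return value of ensure_associated_lvs()
    verified = ['/dev/VolGroup/block', '/dev/sdb1', '/dev/VolGroup/block']
    print(sorted(set(verified)))   # -> ['/dev/VolGroup/block', '/dev/sdb1']
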
index f4c732cab72f393368879eea24c8513ddbc42f58..1d821b602be18329330a800174e90ab1738074f7 100644 (file)
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import argparse
-import pprint
+import json
 
 from ceph_volume.util.device import Devices, Device
 
@@ -39,8 +39,8 @@ class Inventory(object):
 
     def format_report(self, inventory):
         if self.args.format == 'json':
-            print(inventory.json_report())
+            print(json.dumps(inventory.json_report()))
         elif self.args.format == 'json-pretty':
-            pprint.pprint(inventory.json_report())
+            print(json.dumps(inventory.json_report(), indent=4, sort_keys=True))
         else:
             print(inventory.pretty_report())
index cf7dd5d8fc55f2f11817dfe7dc049ccc8037bebb..8ec99bb8469c4f453515affb444c83a94d4f40df 100644 (file)
@@ -192,10 +192,11 @@ def tmpfile(tmpdir):
 
 @pytest.fixture
 def device_info(monkeypatch):
-    def apply(devices=None, lsblk=None, lv=None, blkid=None):
+    def apply(devices=None, lsblk=None, lv=None, blkid=None, udevadm=None):
         devices = devices if devices else {}
         lsblk = lsblk if lsblk else {}
         blkid = blkid if blkid else {}
+        udevadm = udevadm if udevadm else {}
         lv = Factory(**lv) if lv else None
         monkeypatch.setattr("ceph_volume.sys_info.devices", {})
         monkeypatch.setattr("ceph_volume.util.device.disk.get_devices", lambda: devices)
@@ -206,4 +207,5 @@ def device_info(monkeypatch):
         monkeypatch.setattr("ceph_volume.util.device.lvm.get_lv", lambda vg_name, lv_uuid: lv)
         monkeypatch.setattr("ceph_volume.util.device.disk.lsblk", lambda path: lsblk)
         monkeypatch.setattr("ceph_volume.util.device.disk.blkid", lambda path: blkid)
+        monkeypatch.setattr("ceph_volume.util.disk.udevadm_property", lambda *a, **kw: udevadm)
     return apply
index d1f9046a019d15112a886d07fc1a15e616158c57..50ef61b8331e39795277dcc922c2f836b823b3a3 100644 (file)
@@ -1,6 +1,13 @@
 from ceph_volume.devices.lvm import batch
 
 
+class TestBatchSmoke(object):
+
+    def test_batch_instance(self, is_root):
+        b = batch.Batch([])
+        b.main()
+
+
 class TestFilterDevices(object):
 
     def test_filter_used_device(self, factory):
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py
new file mode 100644 (file)
index 0000000..55daa4f
--- /dev/null
@@ -0,0 +1,153 @@
+import pytest
+from ceph_volume.api import lvm as api
+from ceph_volume.devices.lvm import zap
+
+
+class TestFindAssociatedDevices(object):
+
+    def test_no_lvs_found_that_match_id(self, volumes, monkeypatch, device_info):
+        monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+        tags = 'ceph.osd_id=9,ceph.journal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', vg_name='vg', lv_tags=tags)
+        volumes.append(osd)
+        with pytest.raises(RuntimeError):
+            zap.find_associated_devices(osd_id=10)
+
+    def test_no_lvs_found_that_match_fsid(self, volumes, monkeypatch, device_info):
+        monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+        tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', vg_name='vg', lv_tags=tags)
+        volumes.append(osd)
+        with pytest.raises(RuntimeError):
+            zap.find_associated_devices(osd_fsid='aaaa-lkjh')
+
+    def test_no_lvs_found_that_match_id_fsid(self, volumes, monkeypatch, device_info):
+        monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+        tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', vg_name='vg', lv_tags=tags)
+        volumes.append(osd)
+        with pytest.raises(RuntimeError):
+            zap.find_associated_devices(osd_id='9', osd_fsid='aaaa-lkjh')
+
+    def test_no_ceph_lvs_found(self, volumes, monkeypatch):
+        monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', lv_path='/dev/VolGroup/lv', lv_tags='')
+        volumes.append(osd)
+        with pytest.raises(RuntimeError):
+            zap.find_associated_devices(osd_id=100)
+
+    def test_lv_is_matched_id(self, volumes, monkeypatch):
+        monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+        tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        volumes.append(osd)
+        result = zap.find_associated_devices(osd_id='0')
+        assert result[0].abspath == '/dev/VolGroup/lv'
+
+    def test_lv_is_matched_fsid(self, volumes, monkeypatch):
+        monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+        tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        volumes.append(osd)
+        result = zap.find_associated_devices(osd_fsid='asdf-lkjh')
+        assert result[0].abspath == '/dev/VolGroup/lv'
+
+    def test_lv_is_matched_id_fsid(self, volumes, monkeypatch):
+        monkeypatch.setattr(zap.api, 'Volumes', lambda: volumes)
+        tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        volumes.append(osd)
+        result = zap.find_associated_devices(osd_id='0', osd_fsid='asdf-lkjh')
+        assert result[0].abspath == '/dev/VolGroup/lv'
+
+
+class TestEnsureAssociatedLVs(object):
+
+    def test_nothing_is_found(self, volumes):
+        result = zap.ensure_associated_lvs(volumes)
+        assert result == []
+
+    def test_data_is_found(self, volumes):
+        tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/data', lv_tags=tags)
+        volumes.append(osd)
+        result = zap.ensure_associated_lvs(volumes)
+        assert result == ['/dev/VolGroup/data']
+
+    def test_block_is_found(self, volumes):
+        tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=block'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/block', lv_tags=tags)
+        volumes.append(osd)
+        result = zap.ensure_associated_lvs(volumes)
+        assert result == ['/dev/VolGroup/block']
+
+    def test_block_and_partition_are_found(self, volumes, monkeypatch):
+        monkeypatch.setattr(zap.disk, 'get_device_from_partuuid', lambda x: '/dev/sdb1')
+        tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=block'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/block', lv_tags=tags)
+        volumes.append(osd)
+        result = zap.ensure_associated_lvs(volumes)
+        assert '/dev/sdb1' in result
+        assert '/dev/VolGroup/block' in result
+
+    def test_journal_is_found(self, volumes):
+        tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal'
+        osd = api.Volume(
+            lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags)
+        volumes.append(osd)
+        result = zap.ensure_associated_lvs(volumes)
+        assert result == ['/dev/VolGroup/lv']
+
+    def test_multiple_journals_are_found(self, volumes):
+        tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal'
+        for i in range(3):
+            osd = api.Volume(
+                lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags)
+            volumes.append(osd)
+        result = zap.ensure_associated_lvs(volumes)
+        assert '/dev/VolGroup/lv0' in result
+        assert '/dev/VolGroup/lv1' in result
+        assert '/dev/VolGroup/lv2' in result
+
+    def test_multiple_dbs_are_found(self, volumes):
+        tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=db'
+        for i in range(3):
+            osd = api.Volume(
+                lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags)
+            volumes.append(osd)
+        result = zap.ensure_associated_lvs(volumes)
+        assert '/dev/VolGroup/lv0' in result
+        assert '/dev/VolGroup/lv1' in result
+        assert '/dev/VolGroup/lv2' in result
+
+    def test_multiple_wals_are_found(self, volumes):
+        tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.wal_uuid=x,ceph.type=wal'
+        for i in range(3):
+            osd = api.Volume(
+                lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags)
+            volumes.append(osd)
+        result = zap.ensure_associated_lvs(volumes)
+        assert '/dev/VolGroup/lv0' in result
+        assert '/dev/VolGroup/lv1' in result
+        assert '/dev/VolGroup/lv2' in result
+
+    def test_multiple_backing_devs_are_found(self, volumes):
+        for _type in ['journal', 'db', 'wal']:
+            tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.wal_uuid=x,ceph.type=%s' % _type
+            osd = api.Volume(
+                lv_name='volume%s' % _type, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % _type, lv_tags=tags)
+            volumes.append(osd)
+        result = zap.ensure_associated_lvs(volumes)
+        assert '/dev/VolGroup/lvjournal' in result
+        assert '/dev/VolGroup/lvwal' in result
+        assert '/dev/VolGroup/lvdb' in result
index 493c74c509c733be891712414f5a1f95cb30e20a..6333e3a4ea1d7ab0b6c3f82774294a053e26d296 100644 (file)
@@ -19,7 +19,9 @@ class TestZap(object):
         '/dev/mapper/foo',
         '/dev/dm-0',
     ])
-    def test_can_not_zap_mapper_device(self, capsys, is_root, device_name):
+    def test_can_not_zap_mapper_device(self, monkeypatch, device_info, capsys, is_root, device_name):
+        monkeypatch.setattr('os.path.exists', lambda x: True)
+        device_info()
         with pytest.raises(SystemExit):
             lvm.zap.Zap(argv=[device_name]).main()
         stdout, stderr = capsys.readouterr()
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type-dmcrypt/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/mixed-type/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type-dmcrypt/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/bluestore/single-type/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type-dmcrypt/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/mixed-type/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type-dmcrypt/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/centos7/filestore/single-type/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/playbooks/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/playbooks/test_zap.yml
new file mode 100644 (file)
index 0000000..850ecc9
--- /dev/null
@@ -0,0 +1,31 @@
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    - name: stop ceph-osd daemons
+      service:
+        name: "ceph-osd@{{ item }}"
+        state: stopped
+      with_items: "{{ osd_ids }}"
+
+
+- hosts: mons
+  become: yes
+  tasks:
+
+    - name: purge osds
+      command: "ceph --cluster {{ cluster }} osd purge osd.{{ item }} --yes-i-really-mean-it"
+      with_items: "{{ osd_ids }}"
+
+
+- hosts: osds
+  become: yes
+  tasks:
+
+    - name: zap devices used for OSDs
+      command: "ceph-volume --cluster {{ cluster }} lvm zap --osd-id {{ item }} --destroy"
+      with_items: "{{ osd_ids }}"
+      environment:
+        CEPH_VOLUME_DEBUG: 1
+
index c2725a09fd0625964a5cb9962fca3a0dd720dcfe..4c3af68115702ea7e12ef89c8304a50bf9261857 100644 (file)
@@ -63,4 +63,7 @@ commands=
   # retest to ensure cluster came back up correctly
   testinfra -n 4 --sudo -v --connection=ansible --ansible-inventory={changedir}/hosts {envdir}/tmp/ceph-ansible/tests/functional/tests
 
+  # test zap OSDs by ID
+  ansible-playbook -vv -i {changedir}/hosts {changedir}/test_zap.yml
+
   vagrant destroy {env:VAGRANT_DESTROY_FLAGS:"--force"}
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type-dmcrypt/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/bluestore/single-type/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type-dmcrypt/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type-dmcrypt/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
diff --git a/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type/test_zap.yml b/ceph/src/ceph-volume/ceph_volume/tests/functional/batch/xenial/filestore/single-type/test_zap.yml
new file mode 120000 (symlink)
index 0000000..cb969fa
--- /dev/null
@@ -0,0 +1 @@
+../../../playbooks/test_zap.yml
\ No newline at end of file
index bebe6dc36ba513c8904bb7d0e2927e3121fe388c..8caa1ce38bef9d28f55cdf791aca0ce9bf547780 100644 (file)
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    # partitions have been completely removed, so re-create them
+    - name: re-create partition /dev/sdd for lvm data usage
+      parted:
+        device: /dev/sdd
+        number: 1
+        part_start: 0%
+        part_end: 50%
+        unit: '%'
+        label: gpt
+        state: present
+
     - name: redeploy osd.2 using /dev/sdd1
       command: "ceph-volume --cluster {{ cluster }} lvm create --bluestore --data /dev/sdd1 --osd-id 2"
       environment:
index c48e4becece7d8ace417acde3699b373c0e76d98..17b74d5245120fdfd0b75f17fb09caea30d85bbd 100644 (file)
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    # partitions have been completely removed, so re-create them
+    - name: re-create partition /dev/sdd for lvm data usage
+      parted:
+        device: /dev/sdd
+        number: 1
+        part_start: 0%
+        part_end: 50%
+        unit: '%'
+        label: gpt
+        state: present
+
+    - name: re-create partition /dev/sdd for lvm journals
+      parted:
+        device: /dev/sdd
+        number: 2
+        part_start: 50%
+        part_end: 100%
+        unit: '%'
+        state: present
+        label: gpt
+
     - name: redeploy osd.2 using /dev/sdd1
       command: "ceph-volume --cluster {{ cluster }} lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2"
       environment:
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    - name: re-create partition /dev/sdc1
+      parted:
+        device: /dev/sdc
+        number: 1
+        part_start: 0%
+        part_end: 50%
+        unit: '%'
+        state: present
+        label: gpt
+
     - name: prepare osd.0 again using test_group/data-lv1
       command: "ceph-volume --cluster {{ cluster }} lvm prepare --filestore --data test_group/data-lv1 --journal /dev/sdc1 --osd-id 0"
       environment:
index e4e804a709b54f35b2c15c3c7260ff350d581bd2..353df127cb75addcbc9ec1ace272b6bb795b67b3 100644 (file)
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    # partitions have been completely removed, so re-create them
+    - name: re-create partition /dev/sdd for lvm data usage
+      parted:
+        device: /dev/sdd
+        number: 1
+        part_start: 0%
+        part_end: 50%
+        unit: '%'
+        label: gpt
+        state: present
+
     - name: redeploy osd.2 using /dev/sdd1
       command: "ceph-volume --cluster {{ cluster }} lvm create --bluestore --data /dev/sdd1 --osd-id 2"
       environment:
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    - name: find all OSD directories
+      find:
+        paths: /var/lib/ceph/osd
+        recurse: no
+        file_type: directory
+      register: osd_directories
+
+    - name: find all OSD symlinks
+      find:
+        paths: /var/lib/ceph/osd
+        recurse: yes
+        depth: 2
+        file_type: link
+      register: osd_symlinks
+
+    # set the OSD dir and the block/block.db links to root:root permissions, to
+    # ensure that the OSD will be able to activate regardless
+    - file:
+        path: "{{ item.path }}"
+        owner: root
+        group: root
+      with_items:
+        - "{{ osd_directories.files }}"
+
+    - file:
+        path: "{{ item.path }}"
+        owner: root
+        group: root
+      with_items:
+        - "{{ osd_symlinks.files }}"
+
     - name: activate all to start the previously prepared osd.0
       command: "ceph-volume lvm activate --all"
       environment:
index 4aa3cf19d2ea8c79af558d09cdc472e0d9e3cdbd..e896c41b0ea6b5de4d66d13fae34a3a0b72f76d1 100644 (file)
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    # partitions have been completely removed, so re-create them
+    - name: re-create partition /dev/sdd for lvm data usage
+      parted:
+        device: /dev/sdd
+        number: 1
+        part_start: 0%
+        part_end: 50%
+        unit: '%'
+        label: gpt
+        state: present
+
+    - name: re-create partition /dev/sdd for lvm journals
+      parted:
+        device: /dev/sdd
+        number: 2
+        part_start: 50%
+        part_end: 100%
+        unit: '%'
+        state: present
+        label: gpt
+
     - name: redeploy osd.2 using /dev/sdd1
       command: "ceph-volume --cluster {{ cluster }} lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2"
       environment:
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    - name: find all OSD paths
+      find:
+        paths: /var/lib/ceph/osd
+        recurse: no
+        file_type: directory
+      register: osd_paths
+
+    # set all OSD paths to root:root to ensure that the OSD will be able to
+    # activate regardless
+    - name: mangle permissions to root
+      file:
+        path: "{{ item.path }}"
+        owner: root
+        group: root
+        recurse: yes
+      with_items:
+        - "{{ osd_paths.files }}"
+
+    - name: stop ceph-osd@2 daemon
+      service:
+        name: ceph-osd@2
+        state: stopped
+
+    - name: stop ceph-osd@1 daemon
+      service:
+        name: ceph-osd@1
+        state: stopped
+
     - name: activate all to start the previously prepared osd.0
       command: "ceph-volume lvm activate --filestore --all"
       environment:
index 19209b1d2113617102991608166657004b9750c9..3e032e20243baa4355509e55966d643dda51bc0b 100644 (file)
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    # partitions have been completely removed, so re-create them
+    - name: re-create partition /dev/sdd for lvm data usage
+      parted:
+        device: /dev/sdd
+        number: 1
+        part_start: 0%
+        part_end: 50%
+        unit: '%'
+        label: gpt
+        state: present
+
     - name: redeploy osd.2 using /dev/sdd1
       command: "ceph-volume --cluster {{ cluster }} lvm create --bluestore --data /dev/sdd1 --osd-id 2"
       environment:
index c48e4becece7d8ace417acde3699b373c0e76d98..17b74d5245120fdfd0b75f17fb09caea30d85bbd 100644 (file)
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    # partitions have been completely removed, so re-create them
+    - name: re-create partition /dev/sdd for lvm data usage
+      parted:
+        device: /dev/sdd
+        number: 1
+        part_start: 0%
+        part_end: 50%
+        unit: '%'
+        label: gpt
+        state: present
+
+    - name: re-create partition /dev/sdd for lvm journals
+      parted:
+        device: /dev/sdd
+        number: 2
+        part_start: 50%
+        part_end: 100%
+        unit: '%'
+        state: present
+        label: gpt
+
     - name: redeploy osd.2 using /dev/sdd1
       command: "ceph-volume --cluster {{ cluster }} lvm create --filestore --data /dev/sdd1 --journal /dev/sdd2 --osd-id 2"
       environment:
       environment:
         CEPH_VOLUME_DEBUG: 1
 
+    - name: re-create partition /dev/sdc1
+      parted:
+        device: /dev/sdc
+        number: 1
+        part_start: 0%
+        part_end: 50%
+        unit: '%'
+        state: present
+        label: gpt
+
     - name: prepare osd.0 again using test_group/data-lv1
       command: "ceph-volume --cluster {{ cluster }} lvm prepare --filestore --data test_group/data-lv1 --journal /dev/sdc1 --osd-id 0"
       environment:
index 3564cf3cd1f4bb1f8f47026f260e839d9065fc71..f46fcb1d45ea10f78879449269fcb6068d3cbdd7 100644 (file)
   become: True
   any_errors_fatal: true
   roles:
-    - role: ceph-defaults
-      tags: ['ceph_update_config']
-    - role: ceph-handler
-    - role: ceph-common
+    - ceph-defaults
+    - ceph-facts
+    - ceph-handler
+    - ceph-common
   tasks:
     - name: rsync ceph-volume to test nodes on centos
       synchronize:
index 99e1d494c30f125a8cfd3f3615c3d4cc2587d4ad..8be5f8e4be3dd0c897ea84259ff4ac0e5bf54fab 100644 (file)
@@ -18,12 +18,37 @@ class TestDevice(object):
         disk = device.Device("vg/lv")
         assert disk.is_lv
 
-    def test_is_device(self, device_info):
+    def test_vgs_is_empty(self, device_info, pvolumes, monkeypatch):
+        BarPVolume = api.PVolume(pv_name='/dev/sda', pv_uuid="0000", pv_tags={})
+        pvolumes.append(BarPVolume)
+        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+        lsblk = {"TYPE": "disk"}
+        device_info(lsblk=lsblk)
+        disk = device.Device("/dev/nvme0n1")
+        assert disk.vgs == []
+
+    def test_vgs_is_not_empty(self, device_info, pvolumes, monkeypatch):
+        BarPVolume = api.PVolume(vg_name='foo', lv_uuid='111', pv_name='/dev/nvme0n1', pv_uuid="0000", pv_tags={})
+        pvolumes.append(BarPVolume)
+        monkeypatch.setattr(api, 'PVolumes', lambda: pvolumes)
+        lsblk = {"TYPE": "disk"}
+        device_info(lsblk=lsblk)
+        disk = device.Device("/dev/nvme0n1")
+        assert len(disk.vgs) == 1
+
+    def test_device_is_device(self, device_info, pvolumes):
         data = {"/dev/sda": {"foo": "bar"}}
         lsblk = {"TYPE": "device"}
         device_info(devices=data, lsblk=lsblk)
         disk = device.Device("/dev/sda")
-        assert disk.is_device
+        assert disk.is_device is True
+
+    def test_disk_is_device(self, device_info, pvolumes):
+        data = {"/dev/sda": {"foo": "bar"}}
+        lsblk = {"TYPE": "disk"}
+        device_info(devices=data, lsblk=lsblk)
+        disk = device.Device("/dev/sda")
+        assert disk.is_device is True
 
     def test_is_partition(self, device_info, pvolumes):
         data = {"/dev/sda": {"foo": "bar"}}
@@ -51,6 +76,11 @@ class TestDevice(object):
         disk = device.Device("/dev/mapper/foo")
         assert disk.is_mapper
 
+    def test_dm_is_mapper_device(self, device_info):
+        device_info()
+        disk = device.Device("/dev/dm-4")
+        assert disk.is_mapper
+
     def test_is_not_mapper_device(self, device_info):
         device_info()
         disk = device.Device("/dev/sda")
@@ -62,6 +92,14 @@ class TestDevice(object):
         disk = device.Device("/dev/sda")
         assert disk.is_ceph_disk_member
 
+    def test_is_ceph_disk_member_not_available(self, device_info):
+        lsblk = {"PARTLABEL": "ceph data"}
+        device_info(lsblk=lsblk)
+        disk = device.Device("/dev/sda")
+        assert disk.is_ceph_disk_member
+        assert not disk.available
+        assert "Used by ceph-disk" in disk.rejected_reasons
+
     def test_is_not_ceph_disk_member_lsblk(self, device_info):
         lsblk = {"PARTLABEL": "gluster partition"}
         device_info(lsblk=lsblk)
@@ -117,6 +155,125 @@ class TestDevice(object):
         disk = device.Device("/dev/sda")
         assert not disk.used_by_ceph
 
+    def test_get_device_id(self, device_info):
+        udev = {k:k for k in ['ID_VENDOR', 'ID_MODEL', 'ID_SCSI_SERIAL']}
+        device_info(udevadm=udev)
+        disk = device.Device("/dev/sda")
+        assert disk._get_device_id() == 'ID_VENDOR_ID_MODEL_ID_SCSI_SERIAL'
+
+
+
+class TestDeviceEncryption(object):
+
+    def test_partition_is_not_encrypted_lsblk(self, device_info, pvolumes):
+        lsblk = {'TYPE': 'part', 'FSTYPE': 'xfs'}
+        device_info(lsblk=lsblk)
+        disk = device.Device("/dev/sda")
+        assert disk.is_encrypted is False
+
+    def test_partition_is_encrypted_lsblk(self, device_info, pvolumes):
+        lsblk = {'TYPE': 'part', 'FSTYPE': 'crypto_LUKS'}
+        device_info(lsblk=lsblk)
+        disk = device.Device("/dev/sda")
+        assert disk.is_encrypted is True
+
+    def test_partition_is_not_encrypted_blkid(self, device_info, pvolumes):
+        lsblk = {'TYPE': 'part'}
+        blkid = {'TYPE': 'ceph data'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/sda")
+        assert disk.is_encrypted is False
+
+    def test_partition_is_encrypted_blkid(self, device_info, pvolumes):
+        lsblk = {'TYPE': 'part'}
+        blkid = {'TYPE': 'crypto_LUKS'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/sda")
+        assert disk.is_encrypted is True
+
+    def test_mapper_is_encrypted_luks1(self, device_info, pvolumes, monkeypatch):
+        status = {'type': 'LUKS1'}
+        monkeypatch.setattr(device, 'encryption_status', lambda x: status)
+        lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+        blkid = {'TYPE': 'mapper'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/mapper/uuid")
+        assert disk.is_encrypted is True
+
+    def test_mapper_is_encrypted_luks2(self, device_info, pvolumes, monkeypatch):
+        status = {'type': 'LUKS2'}
+        monkeypatch.setattr(device, 'encryption_status', lambda x: status)
+        lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+        blkid = {'TYPE': 'mapper'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/mapper/uuid")
+        assert disk.is_encrypted is True
+
+    def test_mapper_is_encrypted_plain(self, device_info, pvolumes, monkeypatch):
+        status = {'type': 'PLAIN'}
+        monkeypatch.setattr(device, 'encryption_status', lambda x: status)
+        lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+        blkid = {'TYPE': 'mapper'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/mapper/uuid")
+        assert disk.is_encrypted is True
+
+    def test_mapper_is_not_encrypted_plain(self, device_info, pvolumes, monkeypatch):
+        monkeypatch.setattr(device, 'encryption_status', lambda x: {})
+        lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+        blkid = {'TYPE': 'mapper'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/mapper/uuid")
+        assert disk.is_encrypted is False
+
+    def test_lv_is_encrypted_blkid(self, device_info, pvolumes):
+        lsblk = {'TYPE': 'lvm'}
+        blkid = {'TYPE': 'crypto_LUKS'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/sda")
+        disk.lv_api = {}
+        assert disk.is_encrypted is True
+
+    def test_lv_is_not_encrypted_blkid(self, factory, device_info, pvolumes):
+        lsblk = {'TYPE': 'lvm'}
+        blkid = {'TYPE': 'xfs'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/sda")
+        disk.lv_api = factory(encrypted=None)
+        assert disk.is_encrypted is False
+
+    def test_lv_is_encrypted_lsblk(self, device_info, pvolumes):
+        lsblk = {'FSTYPE': 'crypto_LUKS', 'TYPE': 'lvm'}
+        blkid = {'TYPE': 'mapper'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/sda")
+        disk.lv_api = {}
+        assert disk.is_encrypted is True
+
+    def test_lv_is_not_encrypted_lsblk(self, factory, device_info, pvolumes):
+        lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+        blkid = {'TYPE': 'mapper'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/sda")
+        disk.lv_api = factory(encrypted=None)
+        assert disk.is_encrypted is False
+
+    def test_lv_is_encrypted_lvm_api(self, factory, device_info, pvolumes):
+        lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+        blkid = {'TYPE': 'mapper'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/sda")
+        disk.lv_api = factory(encrypted=True)
+        assert disk.is_encrypted is True
+
+    def test_lv_is_not_encrypted_lvm_api(self, factory, device_info, pvolumes):
+        lsblk = {'FSTYPE': 'xfs', 'TYPE': 'lvm'}
+        blkid = {'TYPE': 'mapper'}
+        device_info(lsblk=lsblk, blkid=blkid)
+        disk = device.Device("/dev/sda")
+        disk.lv_api = factory(encrypted=False)
+        assert disk.is_encrypted is False
+
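
The TestDeviceEncryption cases above enumerate the signals that Device.is_encrypted appears to combine: crypto_LUKS reported by lsblk or blkid for partitions and LVs, the LV's own encrypted flag from the LVM API, and a non-empty encryption_status for mapper devices. A simplified decision-function sketch of that behaviour (illustration only, not the actual property):

    def is_encrypted(device_type, lsblk_fstype=None, blkid_type=None,
                     lv_encrypted=None, mapper_status=None):
        """Simplified mirror of the behaviours asserted in the tests above."""
        if device_type == 'mapper':
            # any LUKS1/LUKS2/PLAIN status dict means the mapper is backed by dmcrypt
            return bool(mapper_status)
        if lv_encrypted is not None:
            return bool(lv_encrypted)          # the LVM API already knows
        return 'crypto_LUKS' in (lsblk_fstype, blkid_type)


    print(is_encrypted('part', lsblk_fstype='crypto_LUKS'))             # True
    print(is_encrypted('part', lsblk_fstype='xfs', blkid_type='xfs'))   # False
    print(is_encrypted('lvm', lv_encrypted=False))                      # False
    print(is_encrypted('mapper', mapper_status={'type': 'LUKS2'}))      # True
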
 
 class TestDeviceOrdering(object):
 
index 5d1bd82b60217b30a4d04e4c7c1cc2344d97d977..e40c982d1f7e9f7cb2991b6f6f58f8647947d363 100644 (file)
@@ -45,6 +45,34 @@ class TestBlkid(object):
         assert result['UUID'] == '62416664-cbaf-40bd-9689-10bd337379c3'
         assert result['TYPE'] == 'xfs'
 
+class TestUdevadmProperty(object):
+
+    def test_good_output(self, stub_call):
+        output = """ID_MODEL=SK_hynix_SC311_SATA_512GB
+ID_PART_TABLE_TYPE=gpt
+ID_SERIAL_SHORT=MS83N71801150416A""".split()
+        stub_call((output, [], 0))
+        result = disk.udevadm_property('dev/sda')
+        assert result['ID_MODEL'] == 'SK_hynix_SC311_SATA_512GB'
+        assert result['ID_PART_TABLE_TYPE'] == 'gpt'
+        assert result['ID_SERIAL_SHORT'] == 'MS83N71801150416A'
+
+    def test_property_filter(self, stub_call):
+        output = """ID_MODEL=SK_hynix_SC311_SATA_512GB
+ID_PART_TABLE_TYPE=gpt
+ID_SERIAL_SHORT=MS83N71801150416A""".split()
+        stub_call((output, [], 0))
+        result = disk.udevadm_property('dev/sda', ['ID_MODEL',
+                                                   'ID_SERIAL_SHORT'])
+        assert result['ID_MODEL'] == 'SK_hynix_SC311_SATA_512GB'
+        assert 'ID_PART_TABLE_TYPE' not in result
+
+    def test_fail_on_broken_output(self, stub_call):
+        output = ["ID_MODEL:SK_hynix_SC311_SATA_512GB"]
+        stub_call((output, [], 0))
+        with pytest.raises(ValueError):
+            disk.udevadm_property('dev/sda')
+
 
 class TestDeviceFamily(object):
 
@@ -239,6 +267,28 @@ class TestGetDevices(object):
         assert len(result) == 1
         assert result == [ceph_data_path]
 
+    def test_sda1_partition(self, tmpfile, tmpdir):
+        block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
+        block_sda_path = os.path.join(block_path, 'sda')
+        block_sda1_path = os.path.join(block_sda_path, 'sda1')
+        block_sda1_holders = os.path.join(block_sda1_path, 'holders')
+        dev_sda_path = os.path.join(dev_path, 'sda')
+        dev_sda1_path = os.path.join(dev_path, 'sda1')
+        os.makedirs(block_sda_path)
+        os.makedirs(block_sda1_path)
+        os.makedirs(dev_sda1_path)
+        os.makedirs(block_sda1_holders)
+        os.makedirs(dev_sda_path)
+        tmpfile('size', '1024', directory=block_sda_path)
+        tmpfile('partition', '1', directory=block_sda1_path)
+        result = disk.get_devices(
+            _sys_block_path=block_path,
+            _dev_path=dev_path,
+            _mapper_path=mapper_path)
+        assert dev_sda_path in list(result.keys())
+        assert '/dev/sda1' in list(result.keys())
+        assert result['/dev/sda1']['holders'] == []
+
     def test_sda_size(self, tmpfile, tmpdir):
         block_path, dev_path, mapper_path = self.setup_paths(tmpdir)
         block_sda_path = os.path.join(block_path, 'sda')
index 8cca42689b41ec0652a5b818661c5b2b01ca6371..e1420b440d391d63ffa88b21a6ddb3df159b13d6 100644 (file)
@@ -33,3 +33,21 @@ class TestDmcryptClose(object):
         file_name = '/path/does/not/exist'
         encryption.dmcrypt_close(file_name)
         assert fake_run.calls == []
+
+
+class TestDmcryptKey(object):
+
+    def test_dmcrypt_with_default_size(self, conf_ceph_stub):
+        conf_ceph_stub('[global]\nfsid=asdf-lkjh')
+        result = encryption.create_dmcrypt_key()
+        assert len(result) == 172
+
+    def test_dmcrypt_with_custom_size(self, conf_ceph_stub):
+        conf_ceph_stub('''
+        [global]
+        fsid=asdf
+        [osd]
+        osd_dmcrypt_size=8
+        ''')
+        result = encryption.create_dmcrypt_key()
+        assert len(result) == 172
index 82f2ef27f647148dc9fb8cf72f41da46a995c7a1..1a094d33f5bcf652b125985085cd99a03f7ed9e6 100644 (file)
@@ -15,12 +15,27 @@ class TestAsBytes(object):
 
 class TestStrToInt(object):
 
-    def test_passing_a_float_str(self):
-        result = util.str_to_int("1.99")
+    def test_passing_a_float_str_comma(self):
+        result = util.str_to_int("1,99")
         assert result == 1
 
-    def test_passing_a_float_does_not_round(self):
-        result = util.str_to_int("1.99", round_down=False)
+    def test_passing_a_float_does_not_round_comma(self):
+        result = util.str_to_int("1,99", round_down=False)
+        assert result == 2
+
+    @pytest.mark.parametrize("value", ['2', 2])
+    def test_passing_an_int(self, value):
+        result = util.str_to_int(value)
+        assert result == 2
+
+    @pytest.mark.parametrize("value", ['1.99', 1.99])
+    def test_passing_a_float(self, value):
+        result = util.str_to_int(value)
+        assert result == 1
+
+    @pytest.mark.parametrize("value", ['1.99', 1.99])
+    def test_passing_a_float_does_not_round(self, value):
+        result = util.str_to_int(value, round_down=False)
         assert result == 2
 
     def test_text_is_not_an_integer_like(self):
@@ -28,6 +43,11 @@ class TestStrToInt(object):
             util.str_to_int("1.4GB")
         assert str(error.value) == "Unable to convert to integer: '1.4GB'"
 
+    def test_input_is_not_string(self):
+        with pytest.raises(RuntimeError) as error:
+            util.str_to_int(None)
+        assert str(error.value) == "Unable to convert to integer: 'None'"
+
 
 def true_responses(upper_casing=False):
     if upper_casing:
@@ -75,22 +95,22 @@ class TestPromptBool(object):
     def test_trueish(self, response):
         fake_input = lambda x: response
         qx = 'what the what?'
-        assert util.prompt_bool(qx, _raw_input=fake_input) is True
+        assert util.prompt_bool(qx, input_=fake_input) is True
 
     @pytest.mark.parametrize('response', false_responses())
     def test_falseish(self, response):
         fake_input = lambda x: response
         qx = 'what the what?'
-        assert util.prompt_bool(qx, _raw_input=fake_input) is False
+        assert util.prompt_bool(qx, input_=fake_input) is False
 
     def test_try_again_true(self):
         responses = ['g', 'h', 'y']
         fake_input = lambda x: responses.pop(0)
         qx = 'what the what?'
-        assert util.prompt_bool(qx, _raw_input=fake_input) is True
+        assert util.prompt_bool(qx, input_=fake_input) is True
 
     def test_try_again_false(self):
         responses = ['g', 'h', 'n']
         fake_input = lambda x: responses.pop(0)
         qx = 'what the what?'
-        assert util.prompt_bool(qx, _raw_input=fake_input) is False
+        assert util.prompt_bool(qx, input_=fake_input) is False
index cdcf3a5b0d9cb1d85dde52c3214a281aa1fa11c7..43c9c9d6835d11b53070ac6b7386364b9f637a30 100644 (file)
@@ -2,6 +2,10 @@ import logging
 from math import floor
 from ceph_volume import terminal
 
+try:
+    input = raw_input  # pylint: disable=redefined-builtin
+except NameError:
+    pass
 
 logger = logging.getLogger(__name__)
 
@@ -31,10 +35,21 @@ def str_to_int(string, round_down=True):
     """
     Parses a string number into an integer, optionally converting to a float
     and rounding down.
+
+    Some LVM values may use a comma instead of a dot as the decimal separator.
+    This function normalizes the comma into a dot.
     """
     error_msg = "Unable to convert to integer: '%s'" % str(string)
     try:
-        integer = float(string)
+        integer = float(string.replace(',', '.'))
+    except AttributeError:
+        # this might be an integer already, so try to use it, otherwise raise
+        # the original exception
+        if isinstance(string, (int, float)):
+            integer = string
+        else:
+            logger.exception(error_msg)
+            raise RuntimeError(error_msg)
     except (TypeError, ValueError):
         logger.exception(error_msg)
         raise RuntimeError(error_msg)
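
str_to_int now accepts locale-style decimals that some LVM fields report ("1,99") as well as values that are already numeric. A rough standalone equivalent of the behaviour visible in this hunk (the rounding used when round_down is False is only constrained by the tests above, which require 1.99 to become 2):

    import math

    def str_to_int_sketch(value, round_down=True):
        error_msg = "Unable to convert to integer: '%s'" % str(value)
        try:
            number = float(value.replace(',', '.'))   # LVM may emit comma decimals
        except AttributeError:
            if isinstance(value, (int, float)):       # already numeric, use it directly
                number = value
            else:
                raise RuntimeError(error_msg)         # e.g. None
        except (TypeError, ValueError):
            raise RuntimeError(error_msg)             # e.g. '1.4GB'
        return int(math.floor(number)) if round_down else int(math.ceil(number))

    assert str_to_int_sketch("1,99") == 1
    assert str_to_int_sketch("1,99", round_down=False) == 2
    assert str_to_int_sketch(1.99) == 1
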
@@ -68,12 +83,12 @@ def str_to_bool(val):
         raise ValueError("Invalid input value: %s" % val)
 
 
-def prompt_bool(question, _raw_input=None):
+def prompt_bool(question, input_=None):
     """
     Interface to prompt a boolean (or boolean-like) response from a user.
     Usually a confirmation.
     """
-    input_prompt = _raw_input or raw_input
+    input_prompt = input_ or input
     prompt_format = '--> {question} '.format(question=question)
     response = input_prompt(prompt_format)
     try:
@@ -82,4 +97,4 @@ def prompt_bool(question, _raw_input=None):
         terminal.error('Valid true responses are: y, yes, <Enter>')
         terminal.error('Valid false responses are: n, no')
         terminal.error('That response was invalid, please try again')
-        return prompt_bool(question, _raw_input=input_prompt)
+        return prompt_bool(question, input_=input_prompt)
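
The rename from _raw_input to input_ goes together with the raw_input alias added at the top of this file: on Python 2 the built-in raw_input is bound to input, so prompt_bool works unchanged on both interpreters, and tests can inject a fake reader. A small sketch of the same pattern, reusing the true/false responses documented in the error messages above:

    def str_to_bool_sketch(val):
        if val in ('y', 'yes', ''):      # <Enter> counts as a yes
            return True
        if val in ('n', 'no'):
            return False
        raise ValueError("Invalid input value: %s" % val)

    def prompt_bool_sketch(question, input_=None):
        ask = input_ or input            # tests pass a fake; interactive use falls back to input()
        response = ask('--> %s ' % question)
        try:
            return str_to_bool_sketch(response)
        except ValueError:
            return prompt_bool_sketch(question, input_=ask)   # re-prompt on invalid input

    canned = iter(['maybe', 'y'])
    assert prompt_bool_sketch('zap device?', input_=lambda prompt: next(canned)) is True
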
index 534c9aa64673693c617f461ab14c897fc3f553ab..a04c19924bbd0d2b7745bc33f881768d01800a7b 100644 (file)
@@ -8,15 +8,22 @@ from ceph_volume.util.device import Device
 
 class ValidDevice(object):
 
-    def __init__(self, as_string=False):
+    def __init__(self, as_string=False, gpt_ok=False):
         self.as_string = as_string
+        self.gpt_ok = gpt_ok
 
     def __call__(self, string):
         device = Device(string)
         error = None
         if not device.exists:
             error = "Unable to proceed with non-existing device: %s" % string
-        elif device.has_gpt_headers:
+        # FIXME this is not a nice API, this validator was meant to catch any
+        # non-existing devices upfront, not check for gpt headers. Now this
+        # needs to optionally skip checking gpt headers which is beyond
+        # verifying if the device exists. The better solution would be to
+        # configure this with a list of checks that can be excluded/included on
+        # __init__
+        elif device.has_gpt_headers and not self.gpt_ok:
             error = "GPT headers found, they must be removed on: %s" % string
 
         if error:
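
The FIXME above suggests turning this validator into a configurable list of checks rather than a growing set of boolean flags. A hypothetical sketch of that idea (FakeDevice and the check names are made up for illustration; this is not ceph-volume code):

    class FakeDevice(object):
        def __init__(self, exists=True, has_gpt_headers=False):
            self.exists = exists
            self.has_gpt_headers = has_gpt_headers

    class ValidDeviceSketch(object):
        def __init__(self, skip_checks=()):
            self.skip = set(skip_checks)

        def __call__(self, string, device):
            checks = [
                ('exists', lambda d: not d.exists,
                 "Unable to proceed with non-existing device: %s" % string),
                ('gpt', lambda d: d.has_gpt_headers,
                 "GPT headers found, they must be removed on: %s" % string),
            ]
            for name, failed, message in checks:
                if name not in self.skip and failed(device):
                    raise ValueError(message)
            return string

    # a caller that tolerates GPT headers (such as a zap-style command) excludes that check
    assert ValidDeviceSketch(skip_checks=('gpt',))('/dev/sda', FakeDevice(has_gpt_headers=True)) == '/dev/sda'
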
index 1810448869b1b01f2d676ecb6cea36f99f8b3579..06f90cd374417a7fef172e9d1a185b2f19a9d235 100644 (file)
@@ -10,6 +10,16 @@ report_template = """
 {dev:<25} {size:<12} {rot!s:<7} {available!s:<9} {model}"""
 
 
+def encryption_status(abspath):
+    """
+    Helper function to run ``encryption.status()``. It is done here to avoid
+    a circular import issue (encryption module imports from this module) and to
+    ease testing by allowing monkeypatching of this function.
+    """
+    from ceph_volume.util import encryption
+    return encryption.status(abspath)
+
+
 class Devices(object):
     """
     A container for Device instances with reporting
@@ -79,6 +89,7 @@ class Device(object):
         self._is_lvm_member = None
         self._parse()
         self.available, self.rejected_reasons = self._check_reject_reasons()
+        self.device_id = self._get_device_id()
 
     def __lt__(self, other):
         '''
@@ -172,6 +183,32 @@ class Device(object):
         output['lvs'] = [lv.report() for lv in self.lvs]
         return output
 
+    def _get_device_id(self):
+        """
+        Please keep this implementation in sync with get_device_id() in
+        src/common/blkdev.cc
+        """
+        props = ['ID_VENDOR','ID_MODEL','ID_SERIAL_SHORT', 'ID_SERIAL',
+                 'ID_SCSI_SERIAL']
+        p = disk.udevadm_property(self.abspath, props)
+        if 'ID_VENDOR' in p and 'ID_MODEL' in p and 'ID_SCSI_SERIAL' in p:
+            dev_id = '_'.join([p['ID_VENDOR'], p['ID_MODEL'],
+                              p['ID_SCSI_SERIAL']])
+        elif 'ID_MODEL' in p and 'ID_SERIAL_SHORT' in p:
+            dev_id = '_'.join([p['ID_MODEL'], p['ID_SERIAL_SHORT']])
+        elif 'ID_SERIAL' in p:
+            dev_id = p['ID_SERIAL']
+            if dev_id.startswith('MTFD'):
+                # Micron NVMes hide the vendor
+                dev_id = 'Micron_' + dev_id
+        else:
+            # the else branch should fall back to using sysfs and ioctl to
+            # retrieve device_id on FreeBSD. Still figuring out if/how the
+            # python ioctl implementation does that on FreeBSD
+            dev_id = ''
+        dev_id = dev_id.replace(' ', '_')  # str.replace returns a new string, so assign it back
+        return dev_id
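
The new device_id is assembled from whichever udev identity properties are available, from most to least specific. A standalone sketch of the same fallback chain, fed with an example property dict (values are illustrative):

    def build_device_id(p):
        """Mirror of the fallback order used above; `p` is a dict of udev properties."""
        if 'ID_VENDOR' in p and 'ID_MODEL' in p and 'ID_SCSI_SERIAL' in p:
            dev_id = '_'.join([p['ID_VENDOR'], p['ID_MODEL'], p['ID_SCSI_SERIAL']])
        elif 'ID_MODEL' in p and 'ID_SERIAL_SHORT' in p:
            dev_id = '_'.join([p['ID_MODEL'], p['ID_SERIAL_SHORT']])
        elif 'ID_SERIAL' in p:
            dev_id = p['ID_SERIAL']
            if dev_id.startswith('MTFD'):      # Micron NVMes hide the vendor
                dev_id = 'Micron_' + dev_id
        else:
            dev_id = ''
        return dev_id.replace(' ', '_')

    props = {'ID_MODEL': 'SK hynix SC311', 'ID_SERIAL_SHORT': 'MS83N71801150416A'}
    assert build_device_id(props) == 'SK_hynix_SC311_MS83N71801150416A'
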
+
     def _set_lvm_membership(self):
         if self._is_lvm_member is None:
             # this is contentious, if a PV is recognized by LVM but has no
@@ -185,6 +222,7 @@ class Device(object):
                 pvs.filter(pv_name=path)
                 has_vgs = [pv.vg_name for pv in pvs if pv.vg_name]
                 if has_vgs:
+                    self.vgs = list(set(has_vgs))
                     # a pv can only be in one vg, so this should be safe
                     self.vg_name = has_vgs[0]
                     self._is_lvm_member = True
@@ -194,6 +232,8 @@ class Device(object):
                             lv = lvm.get_lv(vg_name=pv.vg_name, lv_uuid=pv.lv_uuid)
                             if lv:
                                 self.lvs.append(lv)
+                else:
+                    self.vgs = []
         return self._is_lvm_member
 
     def _get_pv_paths(self):
@@ -239,11 +279,18 @@ class Device(object):
 
     @property
     def is_ceph_disk_member(self):
-        return self.ceph_disk.is_member
+        is_member = self.ceph_disk.is_member
+        if self.sys_api.get("partitions"):
+            for part in self.sys_api.get("partitions").keys():
+                part = Device("/dev/%s" % part)
+                if part.is_ceph_disk_member:
+                    is_member = True
+                    break
+        return is_member
 
     @property
     def is_mapper(self):
-        return self.path.startswith('/dev/mapper')
+        return self.path.startswith(('/dev/mapper', '/dev/dm-'))
 
     @property
     def is_lv(self):
@@ -258,9 +305,40 @@ class Device(object):
     @property
     def is_device(self):
         if self.disk_api:
-            return self.disk_api['TYPE'] == 'device'
+            is_device = self.disk_api['TYPE'] == 'device'
+            is_disk = self.disk_api['TYPE'] == 'disk'
+            if is_device or is_disk:
+                return True
         return False
 
+    @property
+    def is_encrypted(self):
+        """
+        Only correct for LVs, device mappers, and partitions. Will report a ``None``
+        for raw devices.
+        """
+        crypt_reports = [self.blkid_api.get('TYPE', ''), self.disk_api.get('FSTYPE', '')]
+        if self.is_lv:
+            # if disk APIs are reporting this is encrypted use that:
+            if 'crypto_LUKS' in crypt_reports:
+                return True
+            # if ceph-volume created this, then a tag would let us know
+            elif self.lv_api.encrypted:
+                return True
+            return False
+        elif self.is_partition:
+            return 'crypto_LUKS' in crypt_reports
+        elif self.is_mapper:
+            active_mapper = encryption_status(self.abspath)
+            if active_mapper:
+                # normalize a bit to ensure same values regardless of source
+                encryption_type = active_mapper['type'].lower().strip('12')  # turn LUKS1 or LUKS2 into luks
+                return True if encryption_type in ['plain', 'luks'] else False
+            else:
+                return False
+        else:
+            return None
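
For mapper devices the type reported by cryptsetup status is normalized before the membership check, so LUKS1 and LUKS2 both count as "luks". A quick illustration of that normalization:

    def normalize_crypt_type(status_type):
        # 'LUKS1'/'LUKS2' -> 'luks', 'PLAIN' -> 'plain'
        return status_type.lower().strip('12')

    assert normalize_crypt_type('LUKS2') == 'luks'
    assert normalize_crypt_type('LUKS1') == 'luks'
    assert normalize_crypt_type('PLAIN') == 'plain'
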
+
     @property
     def used_by_ceph(self):
         # only filter out data devices as journals could potentially be reused
@@ -282,6 +360,9 @@ class Device(object):
         ]
         rejected = [reason for (k, v, reason) in reasons if
                     self.sys_api.get(k, '') == v]
+        if self.is_ceph_disk_member:
+            rejected.append("Used by ceph-disk")
+
         return len(rejected) == 0, rejected
 
 
index ccc2ff7a15226498e698f0322f3a463989a57558..c85d3be9aaad3525b3f67cefe3dffa17c4ae028d 100644 (file)
@@ -127,6 +127,23 @@ def get_device_from_partuuid(partuuid):
     return ' '.join(out).strip()
 
 
+def remove_partition(device):
+    """
+    Removes a partition using parted
+
+    :param device: A ``Device()`` object
+    """
+    parent_device = '/dev/%s' % device.disk_api['PKNAME']
+    udev_info = udevadm_property(device.abspath)
+    partition_number = udev_info.get('ID_PART_ENTRY_NUMBER')
+    if not partition_number:
+        raise RuntimeError('Unable to detect the partition number for device: %s' % device.abspath)
+
+    process.run(
+        ['parted', parent_device, '--script', '--', 'rm', partition_number]
+    )
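
remove_partition resolves the parent disk from lsblk's PKNAME and the partition number from udev's ID_PART_ENTRY_NUMBER, then hands both to parted. For a hypothetical /dev/sda1 whose parent is sda and whose entry number is 1, the assembled call is equivalent to:

    # illustration only: values a hypothetical /dev/sda1 would produce
    parent_device = '/dev/sda'        # '/dev/%s' % device.disk_api['PKNAME']
    partition_number = '1'            # udevadm's ID_PART_ENTRY_NUMBER
    cmd = ['parted', parent_device, '--script', '--', 'rm', partition_number]
    # process.run(cmd) then executes: parted /dev/sda --script -- rm 1
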
+
+
 def _stat_is_device(stat_obj):
     """
     Helper function that will interpret ``os.stat`` output directly, so that other
@@ -170,6 +187,47 @@ def device_family(device):
     return devices
 
 
+def udevadm_property(device, properties=[]):
+    """
+    Query udevadm for information about device properties.
+    Optionally pass a list of properties to return. A requested property might
+    not be returned if not present.
+
+    Expected output format::
+        # udevadm info --query=property --name=/dev/sda
+        DEVNAME=/dev/sda
+        DEVTYPE=disk
+        ID_ATA=1
+        ID_BUS=ata
+        ID_MODEL=SK_hynix_SC311_SATA_512GB
+        ID_PART_TABLE_TYPE=gpt
+        ID_PART_TABLE_UUID=c8f91d57-b26c-4de1-8884-0c9541da288c
+        ID_PATH=pci-0000:00:17.0-ata-3
+        ID_PATH_TAG=pci-0000_00_17_0-ata-3
+        ID_REVISION=70000P10
+        ID_SERIAL=SK_hynix_SC311_SATA_512GB_MS83N71801150416A
+        TAGS=:systemd:
+        USEC_INITIALIZED=16117769
+        ...
+    """
+    out = _udevadm_info(device)
+    ret = {}
+    for line in out:
+        p, v = line.split('=', 1)
+        if not properties or p in properties:
+            ret[p] = v
+    return ret
+
+
+def _udevadm_info(device):
+    """
+    Call udevadm and return the output
+    """
+    cmd = ['udevadm', 'info', '--query=property', device]
+    out, _err, _rc = process.call(cmd)
+    return out
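
udevadm_property splits each KEY=VALUE line on the first '=', keeping either every property or only the requested ones; a line without '=' raises ValueError, which is exactly what the new TestUdevadmProperty cases check. A self-contained sketch of that parsing, using sample udevadm output lines:

    def parse_udev_properties(lines, properties=None):
        result = {}
        for line in lines:
            key, value = line.split('=', 1)   # malformed output raises ValueError here
            if not properties or key in properties:
                result[key] = value
        return result

    sample = ['ID_MODEL=SK_hynix_SC311_SATA_512GB',
              'ID_PART_TABLE_TYPE=gpt',
              'ID_SERIAL_SHORT=MS83N71801150416A']
    filtered = parse_udev_properties(sample, ['ID_MODEL', 'ID_SERIAL_SHORT'])
    assert filtered['ID_MODEL'] == 'SK_hynix_SC311_SATA_512GB'
    assert 'ID_PART_TABLE_TYPE' not in filtered
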
+
+
 def lsblk(device, columns=None, abspath=False):
     """
     Create a dictionary of identifying values for a device using ``lsblk``.
@@ -631,7 +689,7 @@ def get_partitions_facts(sys_block_path):
         folder_path = os.path.join(sys_block_path, folder)
         if os.path.exists(os.path.join(folder_path, 'partition')):
             contents = get_file_contents(os.path.join(folder_path, 'partition'))
-            if '1' in contents:
+            if contents:
                 part = {}
                 partname = folder
                 part_sys_block_path = os.path.join(sys_block_path, partname)
@@ -645,6 +703,9 @@ def get_partitions_facts(sys_block_path):
                     part['sectorsize'] = get_file_contents(
                         part_sys_block_path + "/queue/hw_sector_size", 512)
                 part['size'] = human_readable_size(float(part['sectors']) * 512)
+                part['holders'] = []
+                for holder in os.listdir(part_sys_block_path + '/holders'):
+                    part['holders'].append(holder)
 
                 partition_metadata[partname] = part
     return partition_metadata
@@ -754,5 +815,9 @@ def get_devices(_sys_block_path='/sys/block', _dev_path='/dev', _mapper_path='/d
         metadata['path'] = diskname
         metadata['locked'] = is_locked_raw_device(metadata['path'])
 
+        for part_name, part_metadata in metadata['partitions'].items():
+            part_abspath = '/dev/%s' % part_name
+            device_facts[part_abspath] = part_metadata
+
         device_facts[diskname] = metadata
     return device_facts
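
With this change get_devices() also exposes every partition as a top-level entry next to its parent disk, and get_partitions_facts() records the partition's holders from sysfs. The mapping asserted by test_sda1_partition above roughly looks like this (only the keys exercised by the test are shown):

    # illustrative shape of the returned dict; other metadata keys omitted
    device_facts = {
        '/dev/sda':  {'path': '/dev/sda', 'partitions': {'sda1': {'holders': []}}},
        '/dev/sda1': {'holders': []},     # plus the usual sectors/sectorsize/size metadata
    }
    assert '/dev/sda1' in device_facts
    assert device_facts['/dev/sda1']['holders'] == []
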
index f6e3fdd7ec153d2854503da9d57618bc2b916d79..e2b3ca1646423287b9babd3dae958393758e22e1 100644 (file)
@@ -23,7 +23,7 @@ def create_dmcrypt_key():
     )
     # The size of the key is defined in bits, so we must transform that
     # value to bytes (dividing by 8) because we read in bytes, not bits
-    random_string = os.urandom(dmcrypt_key_size / 8)
+    random_string = os.urandom(int(dmcrypt_key_size / 8))
     key = base64.b64encode(random_string).decode('utf-8')
     return key
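
Casting to int keeps os.urandom() happy on Python 3, where 1024 / 8 is a float. Assuming the configured key size resolves to a 1024-bit default, that is 128 bytes of randomness, and base64 encodes 128 bytes into 172 characters, which is the length the new TestDmcryptKey cases assert:

    import base64, os

    # 1024 bits -> 128 bytes; base64 pads to 4 * ceil(128 / 3) == 172 characters
    key = base64.b64encode(os.urandom(int(1024 / 8))).decode('utf-8')
    assert len(key) == 172
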
 
@@ -60,6 +60,7 @@ def plain_open(key, device, mapping):
         'cryptsetup',
         '--key-file',
         '-',
+        '--allow-discards',  # allow discards (aka TRIM) requests for device
         'open',
         device,
         mapping,
@@ -84,6 +85,7 @@ def luks_open(key, device, mapping):
         'cryptsetup',
         '--key-file',
         '-',
+        '--allow-discards',  # allow discards (aka TRIM) requests for device
         'luksOpen',
         device,
         mapping,
index fc99ad53bef945acddac43cc971a78a4e17319bf..9f78b24a5f8b33bb84a1454cf811c845d0a1553a 100644 (file)
@@ -456,6 +456,7 @@ void Client::dump_status(Formatter *f)
     f->dump_int("mds_epoch", mdsmap->get_epoch());
     f->dump_int("osd_epoch", osd_epoch);
     f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
+    f->dump_bool("blacklisted", blacklisted);
   }
 }
 
@@ -2475,6 +2476,12 @@ void Client::handle_osd_map(MOSDMap *m)
         return o.is_blacklisted(myaddr);});
   }
 
+  // While this client is blacklisted, keep subscribing to the next osdmap
+  // until it is no longer blacklisted.
+  if (blacklisted) {
+    objecter->maybe_request_map();
+  }
+
   if (objecter->osdmap_full_flag()) {
     _handle_full_flag(-1);
   } else {
@@ -2611,13 +2618,14 @@ void Client::handle_fs_map_user(MFSMapUser *m)
 
 void Client::handle_mds_map(MMDSMap* m)
 {
+  mds_gid_t old_inc, new_inc;
   if (m->get_epoch() <= mdsmap->get_epoch()) {
     ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
                   << " is identical to or older than our "
                   << mdsmap->get_epoch() << dendl;
     m->put();
     return;
-  }  
+  }
 
   ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;
 
@@ -2664,6 +2672,13 @@ void Client::handle_mds_map(MMDSMap* m)
     if (!mdsmap->is_up(mds)) {
       session->con->mark_down();
     } else if (mdsmap->get_inst(mds) != session->inst) {
+      old_inc = oldmap->get_incarnation(mds);
+      new_inc = mdsmap->get_incarnation(mds);
+      if (old_inc != new_inc) {
+        ldout(cct, 1) << "mds incarnation changed from "
+                     << old_inc << " to " << new_inc << dendl;
+        oldstate = MDSMap::STATE_NULL;
+      }
       session->con->mark_down();
       session->inst = mdsmap->get_inst(mds);
       // When new MDS starts to take over, notify kernel to trim unused entries
@@ -2674,6 +2689,11 @@ void Client::handle_mds_map(MMDSMap* m)
       continue;  // no change
     
     session->mds_state = newstate;
+    if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) {
+      // missed the reconnect window; close the session so that it can be reopened
+      _closed_mds_session(session);
+      continue;
+    }
     if (newstate == MDSMap::STATE_RECONNECT) {
       session->con = messenger->get_connection(session->inst);
       send_reconnect(session);
@@ -4862,7 +4882,6 @@ void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
          tcap->cap_id = m->peer.cap_id;
          tcap->seq = m->peer.seq - 1;
          tcap->issue_seq = tcap->seq;
-         tcap->mseq = m->peer.mseq;
          tcap->issued |= cap->issued;
          tcap->implemented |= cap->issued;
          if (cap == in->auth_cap)
@@ -9196,6 +9215,8 @@ int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, in
 int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                   const struct iovec *iov, int iovcnt)
 {
+  uint64_t fpos = 0;
+
   if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
     return -EFBIG;
 
@@ -9235,7 +9256,7 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
       }
     }
     offset = f->pos;
-    f->pos = offset+size;
+    fpos = offset+size;
     unlock_fh_pos(f);
   }
 
@@ -9385,6 +9406,11 @@ success:
   lat -= start;
   logger->tinc(l_c_wrlat, lat);
 
+  if (fpos) {
+    lock_fh_pos(f);
+    f->pos = fpos;
+    unlock_fh_pos(f);
+  }
   totalwritten = size;
   r = (int)totalwritten;
 
index 6e2ae4bbd1d1699570f0fd2745b640d02c55b835..1dab0dd72aca1ab6ab5e6eadceb6308cd45e65eb 100644 (file)
@@ -34,7 +34,18 @@ CLS_NAME(lock)
 
 #define LOCK_PREFIX    "lock."
 
-static int read_lock(cls_method_context_t hctx, const string& name, lock_info_t *lock)
+static int clean_lock(cls_method_context_t hctx)
+{
+  int r = cls_cxx_remove(hctx);
+  if (r < 0)
+    return r;
+
+  return 0;
+}
+
+static int read_lock(cls_method_context_t hctx,
+                    const string& name,
+                    lock_info_t *lock)
 {
   bufferlist bl;
   string key = LOCK_PREFIX;
@@ -67,16 +78,20 @@ static int read_lock(cls_method_context_t hctx, const string& name, lock_info_t
   map<locker_id_t, locker_info_t>::iterator iter = lock->lockers.begin();
 
   while (iter != lock->lockers.end()) {
-    map<locker_id_t, locker_info_t>::iterator next = iter;
-    ++next;
-
     struct locker_info_t& info = iter->second;
     if (!info.expiration.is_zero() && info.expiration < now) {
       CLS_LOG(20, "expiring locker");
-      lock->lockers.erase(iter);
+      iter = lock->lockers.erase(iter);
+    } else {
+      ++iter;
     }
+  }
 
-    iter = next;
+  if (lock->lockers.empty() && cls_lock_is_ephemeral(lock->lock_type)) {
+    r = clean_lock(hctx);
+    if (r < 0) {
+      CLS_ERR("error, on read, cleaning lock object %s", cpp_strerror(r).c_str());
+    }
   }
 
   return 0;
@@ -121,24 +136,35 @@ static int lock_obj(cls_method_context_t hctx,
                     const string& cookie,
                     const string& tag)
 {
-  bool exclusive = lock_type == LOCK_EXCLUSIVE;
+  bool exclusive = cls_lock_is_exclusive(lock_type);
   lock_info_t linfo;
-  bool fail_if_exists = (flags & LOCK_FLAG_RENEW) == 0;
+  bool fail_if_exists = (flags & LOCK_FLAG_MAY_RENEW) == 0;
+  bool fail_if_does_not_exist = flags & LOCK_FLAG_MUST_RENEW;
 
-  CLS_LOG(20, "requested lock_type=%s fail_if_exists=%d", cls_lock_type_str(lock_type), fail_if_exists);
-  if (lock_type != LOCK_EXCLUSIVE &&
-      lock_type != LOCK_SHARED)
+  CLS_LOG(20,
+         "requested lock_type=%s fail_if_exists=%d fail_if_does_not_exist=%d",
+         cls_lock_type_str(lock_type), fail_if_exists, fail_if_does_not_exist);
+  if (!cls_lock_is_valid(lock_type)) {
     return -EINVAL;
+  }
 
   if (name.empty())
     return -EINVAL;
 
+  if (!fail_if_exists && fail_if_does_not_exist) {
+    // at most one of LOCK_FLAG_MAY_RENEW and LOCK_FLAG_MUST_RENEW may
+    // be set since they have different implications if the lock does
+    // not already exist
+    return -EINVAL;
+  }
+
   // see if there's already a locker
   int r = read_lock(hctx, name, &linfo);
   if (r < 0 && r != -ENOENT) {
     CLS_ERR("Could not read lock info: %s", cpp_strerror(r).c_str());
     return r;
   }
+
   map<locker_id_t, locker_info_t>& lockers = linfo.lockers;
   map<locker_id_t, locker_info_t>::iterator iter;
 
@@ -160,11 +186,13 @@ static int lock_obj(cls_method_context_t hctx,
   CLS_LOG(20, "existing_lock_type=%s", cls_lock_type_str(existing_lock_type));
   iter = lockers.find(id);
   if (iter != lockers.end()) {
-    if (fail_if_exists) {
+    if (fail_if_exists && !fail_if_does_not_exist) {
       return -EEXIST;
     } else {
       lockers.erase(iter); // remove old entry
     }
+  } else if (fail_if_does_not_exist) {
+    return -ENOENT;
   }
 
   if (!lockers.empty()) {
@@ -235,9 +263,9 @@ static int lock_op(cls_method_context_t hctx,
  *  entity or cookie is wrong), or -errno on other error.
  */
 static int remove_lock(cls_method_context_t hctx,
-                const string& name,
-                entity_name_t& locker,
-                const string& cookie)
+                      const string& name,
+                      entity_name_t& locker,
+                      const string& cookie)
 {
   // get current lockers
   lock_info_t linfo;
@@ -257,7 +285,12 @@ static int remove_lock(cls_method_context_t hctx,
   }
   lockers.erase(iter);
 
-  r = write_lock(hctx, name, linfo);
+  if (cls_lock_is_ephemeral(linfo.lock_type)) {
+    ceph_assert(lockers.empty());
+    r = clean_lock(hctx);
+  } else {
+    r = write_lock(hctx, name, linfo);
+  }
 
   return r;
 }
@@ -301,7 +334,7 @@ static int unlock_op(cls_method_context_t hctx,
  * is wrong), or -errno on other (unexpected) error.
  */
 static int break_lock(cls_method_context_t hctx,
-               bufferlist *in, bufferlist *out)
+                     bufferlist *in, bufferlist *out)
 {
   CLS_LOG(20, "break_lock");
   cls_lock_break_op op;
@@ -421,7 +454,7 @@ int assert_locked(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
     return -EINVAL;
   }
 
-  if (op.type != LOCK_EXCLUSIVE && op.type != LOCK_SHARED) {
+  if (!cls_lock_is_valid(op.type)) {
     return -EINVAL;
   }
 
@@ -493,7 +526,7 @@ int set_cookie(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
     return -EINVAL;
   }
 
-  if (op.type != LOCK_EXCLUSIVE && op.type != LOCK_SHARED) {
+  if (!cls_lock_is_valid(op.type)) {
     return -EINVAL;
   }
 
index 3a3cef367947875b2b5b7db29d52a9412a6d0e99..776fe4889e86fb3f9f5602f2ba910f1fe206bdbb 100644 (file)
@@ -208,14 +208,19 @@ namespace rados {
         rados_op->exec("lock", "set_cookie", in);
       }
 
+      void Lock::assert_locked_shared(ObjectOperation *op)
+      {
+        assert_locked(op, name, LOCK_SHARED, cookie, tag);
+      }
+
       void Lock::assert_locked_exclusive(ObjectOperation *op)
       {
         assert_locked(op, name, LOCK_EXCLUSIVE, cookie, tag);
       }
 
-      void Lock::assert_locked_shared(ObjectOperation *op)
+      void Lock::assert_locked_exclusive_ephemeral(ObjectOperation *op)
       {
-        assert_locked(op, name, LOCK_SHARED, cookie, tag);
+        assert_locked(op, name, LOCK_EXCLUSIVE_EPHEMERAL, cookie, tag);
       }
 
       void Lock::lock_shared(ObjectWriteOperation *op)
@@ -242,6 +247,18 @@ namespace rados {
                     cookie, tag, description, duration, flags);
       }
 
+      void Lock::lock_exclusive_ephemeral(ObjectWriteOperation *op)
+      {
+        lock(op, name, LOCK_EXCLUSIVE_EPHEMERAL,
+             cookie, tag, description, duration, flags);
+      }
+
+      int Lock::lock_exclusive_ephemeral(IoCtx *ioctx, const string& oid)
+      {
+        return lock(ioctx, oid, name, LOCK_EXCLUSIVE_EPHEMERAL,
+                    cookie, tag, description, duration, flags);
+      }
+
       void Lock::unlock(ObjectWriteOperation *op)
       {
        rados::cls::lock::unlock(op, name, cookie);
index 7aa06238f179d18640dce30fad79df0b65f919de..0066dc3c0d9f1b8e3bfc65c25745d2e8e77ea7c4 100644 (file)
@@ -4,6 +4,8 @@
 #ifndef CEPH_CLS_LOCK_CLIENT_H
 #define CEPH_CLS_LOCK_CLIENT_H
 
+#include <chrono>
+
 #include "cls/lock/cls_lock_types.h"
 
 namespace librados {
@@ -87,26 +89,53 @@ namespace rados {
        void set_tag(const std::string& t) { tag = t; }
        void set_description(const std::string& desc) { description = desc; }
        void set_duration(const utime_t& e) { duration = e; }
-       void set_renew(bool renew) {
+       void set_duration(const ceph::timespan& d) {
+         duration = utime_t(ceph::real_clock::time_point::min() + d);
+       }
+
+       void set_may_renew(bool renew) {
          if (renew) {
-           flags |= LOCK_FLAG_RENEW;
+           flags |= LOCK_FLAG_MAY_RENEW;
+           flags &= ~LOCK_FLAG_MUST_RENEW; // if may then not must
          } else {
-           flags &= ~LOCK_FLAG_RENEW;
+           flags &= ~LOCK_FLAG_MAY_RENEW;
+         }
+       }
+
+       void set_must_renew(bool renew) {
+         if (renew) {
+           flags |= LOCK_FLAG_MUST_RENEW;
+           flags &= ~LOCK_FLAG_MAY_RENEW; // if must then not may
+         } else {
+           flags &= ~LOCK_FLAG_MUST_RENEW;
          }
        }
 
-        void assert_locked_exclusive(librados::ObjectOperation *rados_op);
         void assert_locked_shared(librados::ObjectOperation *rados_op);
+        void assert_locked_exclusive(librados::ObjectOperation *rados_op);
+        void assert_locked_exclusive_ephemeral(librados::ObjectOperation *rados_op);
 
        /* ObjectWriteOperation */
-       void lock_exclusive(librados::ObjectWriteOperation *ioctx);
        void lock_shared(librados::ObjectWriteOperation *ioctx);
+       void lock_exclusive(librados::ObjectWriteOperation *ioctx);
+
+       // Be careful when using an exclusive ephemeral lock; it is
+       // intended strictly for cases when a lock object exists
+       // solely for a lock in a given process and the object is no
+       // longer needed when the lock is unlocked or expired, as the
+       // cls back-end will make an effort to delete it.
+       void lock_exclusive_ephemeral(librados::ObjectWriteOperation *ioctx);
        void unlock(librados::ObjectWriteOperation *ioctx);
-       void break_lock(librados::ObjectWriteOperation *ioctx, const entity_name_t& locker);
+       void break_lock(librados::ObjectWriteOperation *ioctx,
+                       const entity_name_t& locker);
 
        /* IoCtx */
-       int lock_exclusive(librados::IoCtx *ioctx, const std::string& oid);
        int lock_shared(librados::IoCtx *ioctx, const std::string& oid);
+       int lock_exclusive(librados::IoCtx *ioctx, const std::string& oid);
+
+       // NB: see above comment on exclusive ephemeral locks
+       int lock_exclusive_ephemeral(librados::IoCtx *ioctx,
+                                    const std::string& oid);
        int unlock(librados::IoCtx *ioctx, const std::string& oid);
        int break_lock(librados::IoCtx *ioctx, const std::string& oid,
                       const entity_name_t& locker);
index 10d00590093c1ac15f7d96116cbbd9763d463f6b..96a2b1ae5766dfeb931f373ab84ec0aff6219e87 100644 (file)
@@ -45,7 +45,7 @@ void cls_lock_lock_op::generate_test_instances(list<cls_lock_lock_op*>& o)
   i->tag = "tag";
   i->description = "description";
   i->duration = utime_t(5, 0);
-  i->flags = LOCK_FLAG_RENEW;
+  i->flags = LOCK_FLAG_MAY_RENEW;
   o.push_back(i);
   o.push_back(new cls_lock_lock_op);
 }
index dbdddfe21407d5182673e4599c689f630fc76dec..b9388e78877c73ddc61801f069ce9f8505c57f49 100644 (file)
@@ -1,3 +1,6 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
 #ifndef CEPH_CLS_LOCK_OPS_H
 #define CEPH_CLS_LOCK_OPS_H
 
index 36d39c89014c67c596cf443bf1c7e3e772a7264b..5f44126b4879d4134d99cfd4104055fd189602f9 100644 (file)
@@ -1,3 +1,6 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
 #ifndef CEPH_CLS_LOCK_TYPES_H
 #define CEPH_CLS_LOCK_TYPES_H
 
 #include "msg/msg_types.h"
 
 /* lock flags */
-#define LOCK_FLAG_RENEW 0x1        /* idempotent lock acquire */
+#define LOCK_FLAG_MAY_RENEW 0x1    /* idempotent lock acquire */
+#define LOCK_FLAG_MUST_RENEW 0x2   /* lock must already be acquired */
 
 enum ClsLockType {
-  LOCK_NONE      = 0,
-  LOCK_EXCLUSIVE = 1,
-  LOCK_SHARED    = 2,
+  LOCK_NONE                = 0,
+  LOCK_EXCLUSIVE           = 1,
+  LOCK_SHARED              = 2,
+  LOCK_EXCLUSIVE_EPHEMERAL = 3, /* lock object is removed @ unlock */
 };
 
 static inline const char *cls_lock_type_str(ClsLockType type)
@@ -24,11 +29,27 @@ static inline const char *cls_lock_type_str(ClsLockType type)
        return "exclusive";
       case LOCK_SHARED:
        return "shared";
+      case LOCK_EXCLUSIVE_EPHEMERAL:
+       return "exclusive-ephemeral";
       default:
        return "<unknown>";
     }
 }
 
+inline bool cls_lock_is_exclusive(ClsLockType type) {
+  return LOCK_EXCLUSIVE == type || LOCK_EXCLUSIVE_EPHEMERAL == type;
+}
+
+inline bool cls_lock_is_ephemeral(ClsLockType type) {
+  return LOCK_EXCLUSIVE_EPHEMERAL == type;
+}
+
+inline bool cls_lock_is_valid(ClsLockType type) {
+  return LOCK_SHARED == type ||
+    LOCK_EXCLUSIVE == type ||
+    LOCK_EXCLUSIVE_EPHEMERAL == type;
+}
+
 namespace rados {
   namespace cls {
     namespace lock {
index 13b3e92dc9598d80de7fd0f084990eebd7c4fe7a..fba47d4601a10136726d54e186dc834c9db2489f 100644 (file)
@@ -1437,13 +1437,15 @@ static int rgw_bucket_link_olh(cls_method_context_t hctx, bufferlist *in, buffer
       if (ret < 0) {
         return ret;
       }
+    }
+
+    removing = existed && op.delete_marker;
+    if (!removing) {
       ret = other_obj.unlink();
       if (ret < 0) {
         return ret;
       }
     }
-
-    removing = existed && op.delete_marker;
   } else {
     removing = (existed && !obj.is_delete_marker() && op.delete_marker);
   }
@@ -3758,7 +3760,7 @@ static int rgw_set_bucket_resharding(cls_method_context_t hctx, bufferlist *in,
 
 static int rgw_clear_bucket_resharding(cls_method_context_t hctx, bufferlist *in,  bufferlist *out)
 {
-  cls_rgw_set_bucket_resharding_op op;
+  cls_rgw_clear_bucket_resharding_op op;
 
   bufferlist::iterator in_iter = in->begin();
   try {
index 3c4ed919a6de243a20d8d30978099aecb10d43c8..93ef2b522d16d66ea8ebeb5094a8a038d62ae6cb 100644 (file)
@@ -92,14 +92,16 @@ bool BucketIndexAioManager::wait_for_completions(int valid_ret_code,
   return true;
 }
 
-void cls_rgw_bucket_init(ObjectWriteOperation& o)
+// note: currently only called by testing code
+void cls_rgw_bucket_init_index(ObjectWriteOperation& o)
 {
   bufferlist in;
   o.exec(RGW_CLASS, RGW_BUCKET_INIT_INDEX, in);
 }
 
 static bool issue_bucket_index_init_op(librados::IoCtx& io_ctx,
-    const string& oid, BucketIndexAioManager *manager) {
+                                      const string& oid,
+                                      BucketIndexAioManager *manager) {
   bufferlist in;
   librados::ObjectWriteOperation op;
   op.create(true);
@@ -107,6 +109,15 @@ static bool issue_bucket_index_init_op(librados::IoCtx& io_ctx,
   return manager->aio_operate(io_ctx, oid, &op);
 }
 
+static bool issue_bucket_index_clean_op(librados::IoCtx& io_ctx,
+                                       const string& oid,
+                                       BucketIndexAioManager *manager) {
+  bufferlist in;
+  librados::ObjectWriteOperation op;
+  op.remove();
+  return manager->aio_operate(io_ctx, oid, &op);
+}
+
 static bool issue_bucket_set_tag_timeout_op(librados::IoCtx& io_ctx,
     const string& oid, uint64_t timeout, BucketIndexAioManager *manager) {
   bufferlist in;
@@ -126,11 +137,16 @@ int CLSRGWIssueBucketIndexInit::issue_op(int shard_id, const string& oid)
 void CLSRGWIssueBucketIndexInit::cleanup()
 {
   // Do best effort removal
-  for (map<int, string>::iterator citer = objs_container.begin(); citer != iter; ++citer) {
+  for (auto citer = objs_container.begin(); citer != iter; ++citer) {
     io_ctx.remove(citer->second);
   }
 }
 
+int CLSRGWIssueBucketIndexClean::issue_op(int shard_id, const string& oid)
+{
+  return issue_bucket_index_clean_op(io_ctx, oid, &manager);
+}
+
 int CLSRGWIssueSetTagTimeout::issue_op(int shard_id, const string& oid)
 {
   return issue_bucket_set_tag_timeout_op(io_ctx, oid, tag_timeout, &manager);
@@ -956,4 +972,3 @@ int CLSRGWIssueSetBucketResharding::issue_op(int shard_id, const string& oid)
 {
   return issue_set_bucket_resharding(io_ctx, oid, entry, &manager);
 }
-
index c4ab0f648a5c2ac922142a2de2c8a9c522ec3f2a..97a950cf0242ca30dd5817b60d2a489f038e643c 100644 (file)
@@ -1,3 +1,6 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
 #ifndef CEPH_CLS_RGW_CLIENT_H
 #define CEPH_CLS_RGW_CLIENT_H
 
@@ -230,7 +233,7 @@ public:
 };
 
 /* bucket index */
-void cls_rgw_bucket_init(librados::ObjectWriteOperation& o);
+void cls_rgw_bucket_init_index(librados::ObjectWriteOperation& o);
 
 class CLSRGWConcurrentIO {
 protected:
@@ -252,9 +255,15 @@ protected:
   virtual void reset_container(map<int, string>& objs) {}
 
 public:
-  CLSRGWConcurrentIO(librados::IoCtx& ioc, map<int, string>& _objs_container,
-                     uint32_t _max_aio) : io_ctx(ioc), objs_container(_objs_container), max_aio(_max_aio) {}
-  virtual ~CLSRGWConcurrentIO() {}
+
+  CLSRGWConcurrentIO(librados::IoCtx& ioc,
+                    map<int, string>& _objs_container,
+                    uint32_t _max_aio) :
+  io_ctx(ioc), objs_container(_objs_container), max_aio(_max_aio)
+  {}
+
+  virtual ~CLSRGWConcurrentIO()
+  {}
 
   int operator()() {
     int ret = 0;
@@ -305,6 +314,23 @@ public:
     CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio) {}
 };
 
+
+class CLSRGWIssueBucketIndexClean : public CLSRGWConcurrentIO {
+protected:
+  int issue_op(int shard_id, const string& oid) override;
+  int valid_ret_code() override {
+    return -ENOENT;
+  }
+
+public:
+  CLSRGWIssueBucketIndexClean(librados::IoCtx& ioc,
+                             map<int, string>& _bucket_objs,
+                             uint32_t _max_aio) :
+  CLSRGWConcurrentIO(ioc, _bucket_objs, _max_aio)
+  {}
+};
+
+
 class CLSRGWIssueSetTagTimeout : public CLSRGWConcurrentIO {
   uint64_t tag_timeout;
 protected:
@@ -536,7 +562,7 @@ int cls_rgw_reshard_get(librados::IoCtx& io_ctx, const string& oid, cls_rgw_resh
 int cls_rgw_reshard_get_head(librados::IoCtx& io_ctx, const string& oid, cls_rgw_reshard_entry& entry);
 void cls_rgw_reshard_remove(librados::ObjectWriteOperation& op, const cls_rgw_reshard_entry& entry);
 
-/* resharding attribute  */
+/* resharding attribute on bucket index shard headers */
 int cls_rgw_set_bucket_resharding(librados::IoCtx& io_ctx, const string& oid,
                                   const cls_rgw_bucket_instance_entry& entry);
 int cls_rgw_clear_bucket_resharding(librados::IoCtx& io_ctx, const string& oid);
index 51107c3259521e1bac11c27b783cd2b677930230..baa61c9fbd363bf7fcd7ab5e1ca2e45c63ba738e 100644 (file)
@@ -1,3 +1,6 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
 #ifndef CEPH_CLS_RGW_TYPES_H
 #define CEPH_CLS_RGW_TYPES_H
 
@@ -609,6 +612,24 @@ enum cls_rgw_reshard_status {
   CLS_RGW_RESHARD_DONE        = 2,
 };
 
+static inline std::string to_string(const enum cls_rgw_reshard_status status)
+{
+  switch (status) {
+  case CLS_RGW_RESHARD_NONE:
+    return "CLS_RGW_RESHARD_NONE";
+    break;
+  case CLS_RGW_RESHARD_IN_PROGRESS:
+    return "CLS_RGW_RESHARD_IN_PROGRESS";
+    break;
+  case CLS_RGW_RESHARD_DONE:
+    return "CLS_RGW_RESHARD_DONE";
+    break;
+  default:
+    break;
+  };
+  return "Unknown reshard status";
+}
+
 struct cls_rgw_bucket_instance_entry {
   cls_rgw_reshard_status reshard_status{CLS_RGW_RESHARD_NONE};
   string new_bucket_instance_id;
index aa53b60f2349e16ff488b9fcf6f827d564412041..1777827e3691a3cbc318e455d3b1930e5c8744c7 100644 (file)
 #define CEPH_COND_H
 
 #include "include/Context.h"
-
-class Cond {
-  // my bits
-  pthread_cond_t _c;
-
-  Mutex *waiter_mutex;
-
-  // don't allow copying.
-  void operator=(Cond &C);
-  Cond(const Cond &C);
-
- public:
-  Cond() : waiter_mutex(NULL) {
-    int r = pthread_cond_init(&_c,NULL);
-    assert(r == 0);
-  }
-  virtual ~Cond() { 
-    pthread_cond_destroy(&_c); 
-  }
-
-  int Wait(Mutex &mutex)  { 
-    // make sure this cond is used with one mutex only
-    assert(waiter_mutex == NULL || waiter_mutex == &mutex);
-    waiter_mutex = &mutex;
-
-    assert(mutex.is_locked());
-
-    mutex._pre_unlock();
-    int r = pthread_cond_wait(&_c, &mutex._m);
-    mutex._post_lock();
-    return r;
-  }
-
-  int WaitUntil(Mutex &mutex, utime_t when) {
-    // make sure this cond is used with one mutex only
-    assert(waiter_mutex == NULL || waiter_mutex == &mutex);
-    waiter_mutex = &mutex;
-
-    assert(mutex.is_locked());
-
-    struct timespec ts;
-    when.to_timespec(&ts);
-
-    mutex._pre_unlock();
-    int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
-    mutex._post_lock();
-
-    return r;
-  }
-
-  int WaitInterval(Mutex &mutex, utime_t interval) {
-    utime_t when = ceph_clock_now();
-    when += interval;
-    return WaitUntil(mutex, when);
-  }
-
-  template<typename Duration>
-  int WaitInterval(Mutex &mutex, Duration interval) {
-    ceph::real_time when(ceph::real_clock::now());
-    when += interval;
-
-    struct timespec ts = ceph::real_clock::to_timespec(when);
-
-    mutex._pre_unlock();
-    int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
-    mutex._post_lock();
-
-    return r;
-  }
-
-  int SloppySignal() { 
-    int r = pthread_cond_broadcast(&_c);
-    return r;
-  }
-  int Signal() { 
-    // make sure signaler is holding the waiter's lock.
-    assert(waiter_mutex == NULL ||
-          waiter_mutex->is_locked());
-
-    int r = pthread_cond_broadcast(&_c);
-    return r;
-  }
-  int SignalOne() { 
-    // make sure signaler is holding the waiter's lock.
-    assert(waiter_mutex == NULL ||
-          waiter_mutex->is_locked());
-
-    int r = pthread_cond_signal(&_c);
-    return r;
-  }
-  int SignalAll() { 
-    // make sure signaler is holding the waiter's lock.
-    assert(waiter_mutex == NULL ||
-          waiter_mutex->is_locked());
-
-    int r = pthread_cond_broadcast(&_c);
-    return r;
-  }
-};
+#include "CondVar.h"
 
 /**
  * context to signal a cond
diff --git a/ceph/src/common/CondVar.h b/ceph/src/common/CondVar.h
new file mode 100644 (file)
index 0000000..c193b99
--- /dev/null
@@ -0,0 +1,109 @@
+#ifndef CEPH_COND_VAR_H
+#define CEPH_COND_VAR_H
+
+#include "include/utime.h"
+
+#include "Clock.h"
+#include "Mutex.h"
+#include "pthread.h"
+
+class Cond {
+  // my bits
+  pthread_cond_t _c;
+
+  Mutex *waiter_mutex;
+
+  // don't allow copying.
+  void operator=(Cond &C);
+  Cond(const Cond &C);
+
+ public:
+  Cond() : waiter_mutex(NULL) {
+    int r = pthread_cond_init(&_c,NULL);
+    assert(r == 0);
+  }
+  virtual ~Cond() { 
+    pthread_cond_destroy(&_c); 
+  }
+
+  int Wait(Mutex &mutex)  { 
+    // make sure this cond is used with one mutex only
+    assert(waiter_mutex == NULL || waiter_mutex == &mutex);
+    waiter_mutex = &mutex;
+
+    assert(mutex.is_locked());
+
+    mutex._pre_unlock();
+    int r = pthread_cond_wait(&_c, &mutex._m);
+    mutex._post_lock();
+    return r;
+  }
+
+  int WaitUntil(Mutex &mutex, utime_t when) {
+    // make sure this cond is used with one mutex only
+    assert(waiter_mutex == NULL || waiter_mutex == &mutex);
+    waiter_mutex = &mutex;
+
+    assert(mutex.is_locked());
+
+    struct timespec ts;
+    when.to_timespec(&ts);
+
+    mutex._pre_unlock();
+    int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
+    mutex._post_lock();
+
+    return r;
+  }
+
+  int WaitInterval(Mutex &mutex, utime_t interval) {
+    utime_t when = ceph_clock_now();
+    when += interval;
+    return WaitUntil(mutex, when);
+  }
+
+  template<typename Duration>
+  int WaitInterval(Mutex &mutex, Duration interval) {
+    ceph::real_time when(ceph::real_clock::now());
+    when += interval;
+
+    struct timespec ts = ceph::real_clock::to_timespec(when);
+
+    mutex._pre_unlock();
+    int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
+    mutex._post_lock();
+
+    return r;
+  }
+
+  int SloppySignal() { 
+    int r = pthread_cond_broadcast(&_c);
+    return r;
+  }
+  int Signal() { 
+    // make sure signaler is holding the waiter's lock.
+    assert(waiter_mutex == NULL ||
+          waiter_mutex->is_locked());
+
+    int r = pthread_cond_broadcast(&_c);
+    return r;
+  }
+  int SignalOne() { 
+    // make sure signaler is holding the waiter's lock.
+    assert(waiter_mutex == NULL ||
+          waiter_mutex->is_locked());
+
+    int r = pthread_cond_signal(&_c);
+    return r;
+  }
+  int SignalAll() { 
+    // make sure signaler is holding the waiter's lock.
+    assert(waiter_mutex == NULL ||
+          waiter_mutex->is_locked());
+
+    int r = pthread_cond_broadcast(&_c);
+    return r;
+  }
+};
+
+#endif // CEPH_COND_VAR_H
index 4ed2fa48bd6f3f515835770045678a2bbcc5b25f..788b29744488d57ca88c07585f23ac5d88686bc8 100644 (file)
@@ -342,8 +342,10 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector, int *sl
     while (i != sdata->ops_in_flight_sharded.end() &&
           i->get_initiated() < too_old) {
 
-      if (!i->warn_interval_multiplier)
+      if (!i->warn_interval_multiplier) {
+       ++i;
        continue;
+      }
 
       (*slow)++;
 
index 64ac120bfa88a8469ad1683690d48e20dc0fdf2d..fa463b4c404983ef204cbd8c107d443f602b232e 100644 (file)
@@ -67,8 +67,11 @@ class WeightedPriorityQueue :  public OpQueue <T, K>
         K key;         // klass
         ListPairs lp;
         Klass(K& k) :
-          key(k)
-          {}
+          key(k) {
+        }
+        ~Klass() {
+          lp.clear_and_dispose(DelItem<ListPair>());
+        }
       friend bool operator< (const Klass &a, const Klass &b)
         { return a.key < b.key; }
       friend bool operator> (const Klass &a, const Klass &b)
@@ -129,8 +132,11 @@ class WeightedPriorityQueue :  public OpQueue <T, K>
        Kit next;
        SubQueue(unsigned& p) :
          key(p),
-         next(klasses.begin())
-         {}
+         next(klasses.begin()) {
+       }
+       ~SubQueue() {
+         klasses.clear_and_dispose(DelItem<Klass>());
+       }
       friend bool operator< (const SubQueue &a, const SubQueue &b)
         { return a.key < b.key; }
       friend bool operator> (const SubQueue &a, const SubQueue &b)
@@ -195,8 +201,11 @@ class WeightedPriorityQueue :  public OpQueue <T, K>
        Queue() :
          total_prio(0),
          max_cost(0),
-         size(0)
-         {}
+         size(0) {
+       }
+       ~Queue() {
+         queues.clear_and_dispose(DelItem<SubQueue>());
+       }
        bool empty() const {
          return !size;
        }
index 09dcc67b27971dbb72c9831fb154a062ffcfd4b5..cf63639a0b47fd120e77b732ab6625c608f350b0 100644 (file)
@@ -1723,6 +1723,32 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
     }
   }
 
+  uint64_t buffer::list::get_wasted_space() const
+  {
+    if (_buffers.size() == 1)
+      return _buffers.back().wasted();
+
+    std::vector<const raw*> raw_vec;
+    raw_vec.reserve(_buffers.size());
+    for (const auto& p : _buffers)
+      raw_vec.push_back(p.get_raw());
+    std::sort(raw_vec.begin(), raw_vec.end());
+
+    uint64_t total = 0;
+    const raw *last = nullptr;
+    for (const auto r : raw_vec) {
+      if (r == last)
+       continue;
+      last = r;
+      total += r->len;
+    }
+    // If multiple buffers are sharing the same raw buffer and they overlap
+    // with each other, the wasted space will be underestimated.
+    if (total <= length())
+      return 0;
+    return total - length();
+  }
+
   void buffer::list::rebuild()
   {
     if (_len == 0) {
index 0afdc3ac66cd83c552ba7d3300e3dcaea932fc36..87194f7dd5370099b4ed8fee4f73e85cec8b820b 100644 (file)
@@ -34,7 +34,8 @@ namespace {
 
 class LockdepObs : public md_config_obs_t {
 public:
-  explicit LockdepObs(CephContext *cct) : m_cct(cct), m_registered(false) {
+  explicit LockdepObs(CephContext *cct)
+    : m_cct(cct), m_registered(false), lock("lock_dep_obs", false, true) {
   }
   ~LockdepObs() override {
     if (m_registered) {
@@ -49,6 +50,7 @@ public:
 
   void handle_conf_change(const md_config_t *conf,
                           const std::set <std::string> &changed) override {
+    Mutex::Locker locker(lock);
     if (conf->lockdep && !m_registered) {
       lockdep_register_ceph_context(m_cct);
       m_registered = true;
@@ -60,14 +62,17 @@ public:
 private:
   CephContext *m_cct;
   bool m_registered;
+  Mutex lock;
 };
 
 class MempoolObs : public md_config_obs_t,
                  public AdminSocketHook {
   CephContext *cct;
+  Mutex lock;
 
 public:
-  explicit MempoolObs(CephContext *cct) : cct(cct) {
+  explicit MempoolObs(CephContext *cct)
+    : cct(cct), lock("mem_pool_obs", false, true) {
     cct->_conf->add_observer(this);
     int r = cct->get_admin_socket()->register_command(
       "dump_mempools",
@@ -92,6 +97,7 @@ public:
 
   void handle_conf_change(const md_config_t *conf,
                           const std::set <std::string> &changed) override {
+    Mutex::Locker locker(lock);
     if (changed.count("mempool_debug")) {
       mempool::set_debug_mode(cct->_conf->mempool_debug);
     }
@@ -184,9 +190,12 @@ private:
  */
 class LogObs : public md_config_obs_t {
   ceph::logging::Log *log;
+  Mutex lock;
 
 public:
-  explicit LogObs(ceph::logging::Log *l) : log(l) {}
+  explicit LogObs(ceph::logging::Log *l)
+    : log(l), lock("log_obs", false, true) {
+  }
 
   const char** get_tracked_conf_keys() const override {
     static const char *KEYS[] = {
@@ -211,6 +220,7 @@ public:
 
   void handle_conf_change(const md_config_t *conf,
                           const std::set <std::string> &changed) override {
+    Mutex::Locker locker(lock);
     // stderr
     if (changed.count("log_to_stderr") || changed.count("err_to_stderr")) {
       int l = conf->log_to_stderr ? 99 : (conf->err_to_stderr ? -1 : -2);
index 41495f5551a283c3628c86b608fdf3e13216931c..38d6f98aa9c764b7d8f06d180a45b4e15577b905 100644 (file)
@@ -46,31 +46,74 @@ void handle_bad_get(CephContext *cct, const std::string& k, const char *name);
 
 std::string cmd_vartype_stringify(const cmd_vartype& v);
 
+struct bad_cmd_get : public std::exception {
+  std::string desc;
+  bad_cmd_get(const std::string& f, const cmdmap_t& cmdmap) {
+    desc = "bad or missing field '" + f + "'";
+  }
+  const char *what() const throw() override {
+    return desc.c_str();
+  }
+};
+
 template <typename T>
-bool
-cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, T& val)
+bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k,
+               T& val)
 {
   if (cmdmap.count(k)) {
     try {
       val = boost::get<T>(cmdmap.find(k)->second);
       return true;
-    } catch (boost::bad_get) {
+    } catch (boost::bad_get&) {
       handle_bad_get(cct, k, typeid(T).name());
     }
   }
   return false;
 }
 
+template <typename T>
+bool cmd_getval_throws(CephContext *cct, const cmdmap_t& cmdmap,
+                      const std::string& k, T& val)
+{
+  if (cmdmap.count(k)) {
+    try {
+      val = boost::get<T>(cmdmap.find(k)->second);
+      return true;
+    } catch (boost::bad_get&) {
+      throw bad_cmd_get(k, cmdmap);
+    }
+  }
+  return false;
+}
+
 // with default
 
 template <typename T>
-void
-cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, T& val, const T& defval)
+void cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k,
+               T& val, const T& defval)
 {
   if (!cmd_getval(cct, cmdmap, k, val))
     val = defval;
 }
 
+template <typename T>
+bool cmd_getval_throws(
+  CephContext *cct, const cmdmap_t& cmdmap, const std::string& k,
+  T& val, const T& defval)
+{
+  if (cmdmap.count(k)) {
+    try {
+      val = boost::get<T>(cmdmap.find(k)->second);
+      return true;
+    } catch (boost::bad_get&) {
+      throw bad_cmd_get(k, cmdmap);
+    }
+  } else {
+    val = defval;
+    return true;
+  }
+}
+
 template <typename T>
 void
 cmd_putval(CephContext *cct, cmdmap_t& cmdmap, const std::string& k, const T& val)
index b3a98a595f2e4089bf9fb017e0a880ca8b9ed446..ef348f95d5c3d86d2a2b0f6b5c7211bb455bae13 100644 (file)
@@ -20,6 +20,7 @@
 #include "osd/osd_types.h"
 #include "common/errno.h"
 #include "common/hostname.h"
+#include "common/backport14.h"
 
 #include <boost/type_traits.hpp>
 
@@ -197,11 +198,16 @@ void md_config_t::add_observer(md_config_obs_t* observer_)
     obs_map_t::value_type val(*k, observer_);
     observers.insert(val);
   }
+  obs_call_gate.emplace(observer_, ceph::make_unique<CallGate>());
 }
 
 void md_config_t::remove_observer(md_config_obs_t* observer_)
 {
   Mutex::Locker l(lock);
+
+  call_gate_close(observer_);
+  obs_call_gate.erase(observer_);
+
   bool found_obs = false;
   for (obs_map_t::iterator o = observers.begin(); o != observers.end(); ) {
     if (o->second == observer_) {
@@ -665,12 +671,21 @@ int md_config_t::parse_injectargs(std::vector<const char*>& args,
 
 void md_config_t::apply_changes(std::ostream *oss)
 {
-  Mutex::Locker l(lock);
-  /*
-   * apply changes until the cluster name is assigned
-   */
-  if (cluster.size())
-    _apply_changes(oss);
+  rev_obs_map_t rev_obs;
+  {
+    Mutex::Locker l(lock);
+    /*
+     * apply changes until the cluster name is assigned
+     */
+    if (cluster.size()) {
+      for_each_change(
+        oss, [this, &rev_obs](md_config_obs_t *obs, const std::string &key) {
+          map_observer_changes(obs, key, &rev_obs);
+        });
+    }
+  }
+
+  call_observers(rev_obs);
 }
 
 bool md_config_t::_internal_field(const string& s)
@@ -680,12 +695,8 @@ bool md_config_t::_internal_field(const string& s)
   return false;
 }
 
-void md_config_t::_apply_changes(std::ostream *oss)
+void md_config_t::for_each_change(std::ostream *oss, config_gather_cb callback)
 {
-  /* Maps observers to the configuration options that they care about which
-   * have changed. */
-  typedef std::map < md_config_obs_t*, std::set <std::string> > rev_obs_map_t;
-
   expand_all_meta();
 
   // expand_all_meta could have modified anything.  Copy it all out again.
@@ -697,9 +708,6 @@ void md_config_t::_apply_changes(std::ostream *oss)
     update_legacy_val(option, ptr);
   }
 
-  // create the reverse observer mapping, mapping observers to the set of
-  // changed keys that they'll get.
-  rev_obs_map_t robs;
   std::set <std::string> empty_set;
   char buf[128];
   char *bufptr = (char*)buf;
@@ -717,71 +725,68 @@ void md_config_t::_apply_changes(std::ostream *oss)
       }
     }
     for (obs_map_t::iterator r = range.first; r != range.second; ++r) {
-      rev_obs_map_t::value_type robs_val(r->second, empty_set);
-      pair < rev_obs_map_t::iterator, bool > robs_ret(robs.insert(robs_val));
-      std::set <std::string> &keys(robs_ret.first->second);
-      keys.insert(key);
+      callback(r->second, key);
     }
   }
 
   changed.clear();
-
-  // Make any pending observer callbacks
-  for (rev_obs_map_t::const_iterator r = robs.begin(); r != robs.end(); ++r) {
-    md_config_obs_t *obs = r->first;
-    obs->handle_conf_change(this, r->second);
-  }
-
 }
 
 void md_config_t::call_all_observers()
 {
-  std::map<md_config_obs_t*,std::set<std::string> > obs;
+  rev_obs_map_t rev_obs;
   {
     Mutex::Locker l(lock);
 
     expand_all_meta();
 
     for (auto r = observers.begin(); r != observers.end(); ++r) {
-      obs[r->second].insert(r->first);
+      map_observer_changes(r->second, r->first, &rev_obs);
     }
   }
-  for (auto p = obs.begin();
-       p != obs.end();
-       ++p) {
-    p->first->handle_conf_change(this, p->second);
-  }
+
+  call_observers(rev_obs);
 }
 
 int md_config_t::injectargs(const std::string& s, std::ostream *oss)
 {
   int ret;
-  Mutex::Locker l(lock);
-  char b[s.length()+1];
-  strcpy(b, s.c_str());
-  std::vector<const char*> nargs;
-  char *p = b;
-  while (*p) {
-    nargs.push_back(p);
-    while (*p && *p != ' ') p++;
-    if (!*p)
-      break;
-    *p++ = 0;
-    while (*p && *p == ' ') p++;
-  }
-  ret = parse_injectargs(nargs, oss);
-  if (!nargs.empty()) {
-    *oss << " failed to parse arguments: ";
-    std::string prefix;
-    for (std::vector<const char*>::const_iterator i = nargs.begin();
-        i != nargs.end(); ++i) {
-      *oss << prefix << *i;
-      prefix = ",";
+  rev_obs_map_t rev_obs;
+  {
+    Mutex::Locker l(lock);
+
+    char b[s.length()+1];
+    strcpy(b, s.c_str());
+    std::vector<const char*> nargs;
+    char *p = b;
+    while (*p) {
+      nargs.push_back(p);
+      while (*p && *p != ' ') p++;
+      if (!*p)
+        break;
+      *p++ = 0;
+      while (*p && *p == ' ') p++;
+    }
+    ret = parse_injectargs(nargs, oss);
+    if (!nargs.empty()) {
+      *oss << " failed to parse arguments: ";
+      std::string prefix;
+      for (std::vector<const char*>::const_iterator i = nargs.begin();
+           i != nargs.end(); ++i) {
+        *oss << prefix << *i;
+        prefix = ",";
+      }
+      *oss << "\n";
+      ret = -EINVAL;
     }
-    *oss << "\n";
-    ret = -EINVAL;
+
+    for_each_change(
+      oss, [this, &rev_obs](md_config_obs_t *obs, const std::string &key) {
+        map_observer_changes(obs, key, &rev_obs);
+      });
   }
-  _apply_changes(oss);
+
+  call_observers(rev_obs);
   return ret;
 }
 
@@ -1389,3 +1394,26 @@ void md_config_t::complain_about_parse_errors(CephContext *cct)
   ::complain_about_parse_errors(cct, &parse_errors);
 }
 
+void md_config_t::call_observers(rev_obs_map_t &rev_obs) {
+  for (auto p : rev_obs) {
+    p.first->handle_conf_change(this, p.second);
+    // this can be done outside the lock as call_gate_enter()
+    // and remove_observer() are serialized via lock
+    call_gate_leave(p.first);
+  }
+}
+
+void md_config_t::map_observer_changes(md_config_obs_t *obs, const std::string &key,
+                                       rev_obs_map_t *rev_obs) {
+  ceph_assert(lock.is_locked());
+
+  auto p = rev_obs->emplace(obs, std::set<std::string>{});
+
+  p.first->second.emplace(key);
+  if (p.second) {
+    // this needs to be done under lock as once this lock is
+    // dropped (before calling observers) a remove_observer()
+    // can sneak in and cause havoc.
+    call_gate_enter(p.first->first);
+  }
+}
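
A minimal standalone sketch (hypothetical names, not this commit's code) of the pattern the rewritten apply_changes()/injectargs() paths above follow: changed keys are mapped to their observers while the configuration lock is held, and the callbacks run only after the lock is released.

// gather under the lock, call observers outside the lock
#include <iostream>
#include <map>
#include <mutex>
#include <set>
#include <string>

struct Observer {
  std::string name;
  void handle_conf_change(const std::set<std::string>& changed) {
    for (const auto& k : changed)
      std::cout << name << " sees change to " << k << "\n";
  }
};

int main()
{
  std::mutex lock;
  Observer log_obs{"log"}, osd_obs{"osd"};
  // observers keyed by the option they watch (many-to-many in the real code)
  std::multimap<std::string, Observer*> observers{
    {"debug_ms", &log_obs}, {"osd_max_backfills", &osd_obs}};
  std::set<std::string> changed{"debug_ms", "osd_max_backfills"};

  // phase 1: under the lock, build the reverse map observer -> changed keys
  std::map<Observer*, std::set<std::string>> rev_obs;
  {
    std::lock_guard<std::mutex> l(lock);
    for (const auto& key : changed) {
      auto range = observers.equal_range(key);
      for (auto it = range.first; it != range.second; ++it)
        rev_obs[it->second].insert(key);
    }
  }

  // phase 2: outside the lock, deliver the callbacks
  for (auto& p : rev_obs)
    p.first->handle_conf_change(p.second);
  return 0;
}
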
index 612f083d89cfe06ee536fb2b8e6803fc4b9c085f..1145e12e3c58652e39058cdecc89bb01a842aff4 100644 (file)
@@ -19,6 +19,7 @@
 #include "common/entity_name.h"
 #include "common/code_environment.h"
 #include "common/Mutex.h"
+#include "common/CondVar.h"
 #include "log/SubsystemMap.h"
 #include "common/config_obs.h"
 #include "common/options.h"
@@ -65,6 +66,62 @@ extern const char *CEPH_CONF_FILE_DEFAULT;
  * while another thread is reading them, either.
  */
 struct md_config_t {
+private:
+  class CallGate {
+  private:
+    uint32_t call_count = 0;
+    Mutex lock;
+    Cond cond;
+  public:
+    CallGate()
+      : lock("call::gate::lock", false, true) {
+    }
+
+    void enter() {
+      Mutex::Locker locker(lock);
+      ++call_count;
+    }
+    void leave() {
+      Mutex::Locker locker(lock);
+      ceph_assert(call_count > 0);
+      if (--call_count == 0) {
+        cond.Signal();
+      }
+    }
+    void close() {
+      Mutex::Locker locker(lock);
+      while (call_count != 0) {
+        cond.Wait(lock);
+      }
+    }
+  };
+
+  void call_gate_enter(md_config_obs_t *obs) {
+    auto p = obs_call_gate.find(obs);
+    ceph_assert(p != obs_call_gate.end());
+    p->second->enter();
+  }
+  void call_gate_leave(md_config_obs_t *obs) {
+    auto p = obs_call_gate.find(obs);
+    ceph_assert(p != obs_call_gate.end());
+    p->second->leave();
+  }
+  void call_gate_close(md_config_obs_t *obs) {
+    auto p = obs_call_gate.find(obs);
+    ceph_assert(p != obs_call_gate.end());
+    p->second->close();
+  }
+
+  typedef std::unique_ptr<CallGate> CallGateRef;
+  std::map<md_config_obs_t*, CallGateRef> obs_call_gate;
+
+  typedef std::map<md_config_obs_t*, std::set<std::string>> rev_obs_map_t;
+  typedef std::function<void(md_config_obs_t*, const std::string&)> config_gather_cb;
+
+  void call_observers(rev_obs_map_t &rev_obs);
+  void map_observer_changes(md_config_obs_t *obs, const std::string &key,
+                            rev_obs_map_t *rev_obs);
+
 public:
   typedef boost::variant<int64_t md_config_t::*,
                          uint64_t md_config_t::*,
@@ -135,7 +192,7 @@ public:
 
   // Expand all metavariables. Make any pending observer callbacks.
   void apply_changes(std::ostream *oss);
-  void _apply_changes(std::ostream *oss);
+  void for_each_change(std::ostream *oss, config_gather_cb callback);
   bool _internal_field(const string& k);
   void call_all_observers();
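
The CallGate above is a small count-and-wait gate; here is a standalone sketch of the same idea (using std::mutex/std::condition_variable rather than the Ceph Mutex/Cond wrappers, not part of this commit): enter()/leave() track in-flight observer callbacks, and close() blocks until they drain, so remove_observer() cannot return while its observer is still being called.

#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class CallGate {
  std::mutex m;
  std::condition_variable cv;
  unsigned call_count = 0;
public:
  void enter() { std::lock_guard<std::mutex> l(m); ++call_count; }
  void leave() {
    std::lock_guard<std::mutex> l(m);
    if (--call_count == 0)
      cv.notify_all();
  }
  void close() {
    std::unique_lock<std::mutex> l(m);
    cv.wait(l, [this] { return call_count == 0; });
  }
};

int main()
{
  CallGate gate;
  gate.enter();                          // an observer callback is "in flight"
  std::thread cb([&gate] {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    std::cout << "callback finished\n";
    gate.leave();
  });
  gate.close();                          // blocks until leave() runs
  std::cout << "safe to remove the observer\n";
  cb.join();
  return 0;
}
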
 
index 24eeb9754e2f24290828a03e3fa372cb5aac691b..8b68f85b4675e51ddc695de69ff4e64c93095aac 100644 (file)
@@ -35,6 +35,21 @@ namespace ceph {
 #endif
 
 struct hobject_t {
+public:
+  static const int64_t POOL_META = -1;
+  static const int64_t POOL_TEMP_START = -2; // and then negative
+
+  static bool is_temp_pool(int64_t pool) {
+    return pool <= POOL_TEMP_START;
+  }
+  static int64_t get_temp_pool(int64_t pool) {
+    return POOL_TEMP_START - pool;
+  }
+  static bool is_meta_pool(int64_t pool) {
+    return pool == POOL_META;
+  }
+
+public:
   object_t oid;
   snapid_t snap;
 private:
@@ -42,9 +57,6 @@ private:
   bool max;
   uint32_t nibblewise_key_cache;
   uint32_t hash_reverse_bits;
-  static const int64_t POOL_META = -1;
-  static const int64_t POOL_TEMP_START = -2; // and then negative
-  friend class spg_t;  // for POOL_TEMP_START
 public:
   int64_t pool;
   string nspace;
@@ -84,10 +96,16 @@ public:
   }
 
   bool is_temp() const {
-    return pool <= POOL_TEMP_START && pool != INT64_MIN;
+    return is_temp_pool(pool) && pool != INT64_MIN;
   }
   bool is_meta() const {
-    return pool == POOL_META;
+    return is_meta_pool(pool);
+  }
+  int64_t get_logical_pool() const {
+    if (is_temp_pool(pool))
+      return get_temp_pool(pool);  // it's reversible
+    else
+      return pool;
   }
 
   hobject_t() : snap(0), hash(0), max(false), pool(INT64_MIN) {
@@ -258,7 +276,8 @@ public:
   hobject_t make_temp_hobject(const string& name) const {
     return hobject_t(object_t(name), "", CEPH_NOSNAP,
                     hash,
-                    hobject_t::POOL_TEMP_START - pool, "");
+                    get_temp_pool(pool),
+                    "");
   }
 
   void swap(hobject_t &o) {
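
A small standalone illustration (not part of the commit) of why the comment above calls get_temp_pool() reversible: POOL_TEMP_START - pool is its own inverse, so a temp pool id maps straight back to the logical pool it was derived from.

#include <cassert>
#include <cstdint>

static const int64_t POOL_TEMP_START = -2;
static int64_t get_temp_pool(int64_t pool) { return POOL_TEMP_START - pool; }

int main()
{
  int64_t logical = 7;
  int64_t temp = get_temp_pool(logical);    // -2 - 7 = -9, i.e. <= POOL_TEMP_START
  assert(temp <= POOL_TEMP_START);
  assert(get_temp_pool(temp) == logical);   // -2 - (-9) = 7: reversible
  return 0;
}
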
index a51870ef64289ba12163b40b4c0a02ec4792995f..828697758124093a1071e6209fb33bff8ee87a5a 100644 (file)
@@ -429,7 +429,6 @@ OPTION(journaler_write_head_interval, OPT_INT)
 OPTION(journaler_prefetch_periods, OPT_INT)   // * journal object size
 OPTION(journaler_prezero_periods, OPT_INT)     // * journal object size
 OPTION(mds_data, OPT_STR)
-OPTION(mds_max_file_size, OPT_U64) // Used when creating new CephFS. Change with 'ceph mds set max_file_size <size>' afterwards
 // max xattr kv pairs size for each dir/file
 OPTION(mds_max_xattr_pairs_size, OPT_U32)
 OPTION(mds_max_file_recover, OPT_U32)
@@ -440,17 +439,15 @@ OPTION(mds_beacon_interval, OPT_FLOAT)
 OPTION(mds_beacon_grace, OPT_FLOAT)
 OPTION(mds_enforce_unique_name, OPT_BOOL)
 
-OPTION(mds_session_timeout, OPT_FLOAT)    // cap bits and leases time out if client unresponsive or not returning its caps
 OPTION(mds_session_blacklist_on_timeout, OPT_BOOL)    // whether to blacklist clients whose sessions are dropped due to timeout
 OPTION(mds_session_blacklist_on_evict, OPT_BOOL)  // whether to blacklist clients whose sessions are dropped via admin commands
 
 OPTION(mds_sessionmap_keys_per_op, OPT_U32)    // how many sessions should I try to load/store in a single OMAP operation?
 OPTION(mds_recall_state_timeout, OPT_FLOAT)    // detect clients which aren't trimming caps
 OPTION(mds_freeze_tree_timeout, OPT_FLOAT)    // detecting freeze tree deadlock
-OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session
 OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
 OPTION(mds_reconnect_timeout, OPT_FLOAT)  // seconds to wait for clients during mds restart
-             //  make it (mds_session_timeout - mds_beacon_grace)
+             //  make it (mdsmap.session_timeout - mds_beacon_grace)
 OPTION(mds_tick_interval, OPT_FLOAT)
 OPTION(mds_dirstat_min_interval, OPT_FLOAT)    // try to avoid propagating more often than this
 OPTION(mds_scatter_nudge_interval, OPT_FLOAT)  // how quickly dirstat changes propagate up the hierarchy
@@ -467,7 +464,6 @@ OPTION(mds_bal_export_pin, OPT_BOOL)  // allow clients to pin directory trees to
 OPTION(mds_bal_sample_interval, OPT_DOUBLE)  // every 3 seconds
 OPTION(mds_bal_replicate_threshold, OPT_FLOAT)
 OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT)
-OPTION(mds_bal_frag, OPT_BOOL)
 OPTION(mds_bal_split_size, OPT_INT)
 OPTION(mds_bal_split_rd, OPT_FLOAT)
 OPTION(mds_bal_split_wr, OPT_FLOAT)
@@ -665,7 +661,8 @@ OPTION(osd_peering_wq_threads, OPT_INT)
 OPTION(osd_peering_wq_batch_size, OPT_U64)
 OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64)
 OPTION(osd_op_pq_min_cost, OPT_U64)
-OPTION(osd_disk_threads, OPT_INT)
+OPTION(osd_remove_threads, OPT_INT)
+OPTION(osd_recovery_threads, OPT_INT)
 OPTION(osd_disk_thread_ioprio_class, OPT_STR) // rt realtime be best effort idle
 OPTION(osd_disk_thread_ioprio_priority, OPT_INT) // 0-7
 OPTION(osd_recover_clone_overlap, OPT_BOOL)   // preserve clone_overlap during recovery/migration
@@ -847,6 +844,7 @@ OPTION(osd_op_history_duration, OPT_U32) // Oldest completed op to track
 OPTION(osd_op_history_slow_op_size, OPT_U32)           // Max number of slow ops to track
 OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE) // track the op if over this threshold
 OPTION(osd_target_transaction_size, OPT_INT)     // to adjust various transactions that batch smaller items
+OPTION(osd_delete_sleep, OPT_FLOAT)         // seconds to sleep between removal transactions
 OPTION(osd_failsafe_full_ratio, OPT_FLOAT) // what % full makes an OSD "full" (failsafe)
 OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL) // immediately mark OSDs as down once they refuse to accept connections
 
@@ -1003,6 +1001,9 @@ OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT)  // max fs free / total free
 OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT) // how much to add at a time
 OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT) // how much to reclaim at a time
 OPTION(bluestore_bluefs_balance_interval, OPT_FLOAT) // how often (sec) to balance free space between bluefs and bluestore
+// how often (sec) to dump allocation failures that happened during bluefs rebalance
+OPTION(bluestore_bluefs_balance_failure_dump_interval, OPT_FLOAT)
+
 // If you want to use spdk driver, you need to specify NVMe serial number here
 // with "spdk:" prefix.
 // Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to
@@ -1031,6 +1032,7 @@ OPTION(bluestore_block_preallocate_file, OPT_BOOL) //whether preallocate space i
 OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8
 OPTION(bluestore_csum_min_block, OPT_U32)
 OPTION(bluestore_csum_max_block, OPT_U32)
+OPTION(bluestore_retry_disk_reads, OPT_U64)
 OPTION(bluestore_min_alloc_size, OPT_U32)
 OPTION(bluestore_min_alloc_size_hdd, OPT_U32)
 OPTION(bluestore_min_alloc_size_ssd, OPT_U32)
@@ -1124,6 +1126,7 @@ OPTION(bluestore_debug_omit_kv_commit, OPT_BOOL)
 OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL)
 OPTION(bluestore_shard_finishers, OPT_BOOL)
 OPTION(bluestore_debug_random_read_err, OPT_DOUBLE)
+OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT)
 
 OPTION(kstore_max_ops, OPT_U64)
 OPTION(kstore_max_bytes, OPT_U64)
@@ -1540,6 +1543,7 @@ OPTION(rgw_shard_warning_threshold, OPT_DOUBLE) // pct of safe max
 
 OPTION(rgw_swift_versioning_enabled, OPT_BOOL) // whether swift object versioning feature is enabled
 
+OPTION(rgw_trust_forwarded_https, OPT_BOOL) // trust Forwarded and X-Forwarded-Proto headers for ssl termination
 OPTION(rgw_crypt_require_ssl, OPT_BOOL) // requests including encryption key headers must be sent over ssl
 OPTION(rgw_crypt_default_encryption_key, OPT_STR) // base64 encoded key for encryption of rgw objects
 OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR) // extra keys that may be used for aws:kms
index ff3bb1a1be193d7c4e66310a83f106cb24982ce8..231a7651b17f9377acd368a9192c57c2b6eef553 100644 (file)
@@ -898,7 +898,7 @@ std::vector<Option> get_global_options() {
     .set_description(""),
 
     Option("mon_osd_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-    .set_default(10)
+    .set_default(500)
     .set_description(""),
 
     Option("mon_cpu_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
@@ -1950,7 +1950,11 @@ std::vector<Option> get_global_options() {
     .set_default(65536)
     .set_description(""),
 
-    Option("osd_disk_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    Option("osd_remove_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description(""),
+
+    Option("osd_recovery_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(1)
     .set_description(""),
 
@@ -2842,6 +2846,10 @@ std::vector<Option> get_global_options() {
     .set_default(30)
     .set_description(""),
 
+    Option("osd_delete_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Time in seconds to sleep before next removal transaction"),
+
     Option("osd_failsafe_full_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(.97)
     .set_description(""),
@@ -2956,7 +2964,7 @@ std::vector<Option> get_global_options() {
     .set_description(""),
 
     Option("rocksdb_cache_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
-    .set_default(128_M)
+    .set_default(512_M)
     .set_description(""),
 
     Option("rocksdb_cache_row_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
@@ -3164,6 +3172,7 @@ std::vector<Option> get_global_options() {
 
     Option("osd_memory_expected_fragmentation", Option::TYPE_FLOAT, Option::LEVEL_DEV)
     .set_default(0.15)
+    .set_min_max(0.0, 1.0)
     .add_see_also("bluestore_cache_autotune")
     .set_description("When tcmalloc and cache autotuning is enabled, estimate the percent of memory fragmentation."),
 
@@ -3335,6 +3344,11 @@ std::vector<Option> get_global_options() {
     .set_default(1)
     .set_description("How frequently (in seconds) to balance free space between BlueFS and BlueStore"),
 
+    Option("bluestore_bluefs_balance_failure_dump_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("How frequently (in seconds) to dump information on "
+      "allocation failures that occurred during BlueFS space rebalance"),
+
     Option("bluestore_spdk_mem", Option::TYPE_UINT, Option::LEVEL_DEV)
     .set_default(512)
     .set_description(""),
@@ -3422,6 +3436,12 @@ std::vector<Option> get_global_options() {
     .set_description("Maximum block size to checksum")
     .add_see_also("bluestore_csum_min_block"),
 
+    Option("bluestore_retry_disk_reads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_min_max(0, 255)
+    .set_description("Number of read retries on checksum validation error")
+    .set_long_description("Data is re-read from disk up to this many times when checksum validation fails, to handle spurious read errors gracefully."),
+
     Option("bluestore_min_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(0)
     .add_tag("mkfs")
@@ -3791,6 +3811,10 @@ std::vector<Option> get_global_options() {
     .set_default(0)
     .set_description(""),
 
+    Option("bluestore_debug_inject_csum_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.0)
+    .set_description("inject crc verification errors into bluestore device reads"),
+
     // -----------------------------------------
     // kstore
 
@@ -5572,6 +5596,17 @@ std::vector<Option> get_rgw_options() {
     .set_default(120)
     .set_description(""),
 
+    Option("rgw_trust_forwarded_https", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Trust Forwarded and X-Forwarded-Proto headers")
+    .set_long_description(
+        "When a proxy in front of radosgw is used for ssl termination, radosgw "
+        "does not know whether incoming http connections are secure. Enable "
+        "this option to trust the Forwarded and X-Forwarded-Proto headers sent "
+        "by the proxy when determining whether the connection is secure. This "
+        "is required for some features, such as server side encryption.")
+    .add_see_also("rgw_crypt_require_ssl"),
+
     Option("rgw_crypt_require_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
     .set_description("Requests including encryption key headers must be sent over ssl"),
@@ -5670,6 +5705,17 @@ std::vector<Option> get_rgw_options() {
                          "of RGW instances under heavy use. If you would like "
                          "to turn off cache expiry, set this value to zero."),
 
+    Option("rgw_max_listing_results", Option::TYPE_UINT,
+          Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_min_max(1, 100000)
+    .add_service("rgw")
+    .set_description("Upper bound on results in listing operations, ListBucket max-keys")
+    .set_long_description("This caps the maximum permitted value for listing-like operations in RGW S3. "
+                         "Affects ListBucket(max-keys), "
+                         "ListBucketVersions(max-keys), "
+                         "ListBucketMultiPartUploads(max-uploads), "
+                         "ListMultipartUploadParts(max-parts)"),
   });
 }
 
@@ -6026,10 +6072,6 @@ std::vector<Option> get_mds_options() {
     .set_default("/var/lib/ceph/mds/$cluster-$id")
     .set_description(""),
 
-    Option("mds_max_file_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
-    .set_default(1ULL << 40)
-    .set_description(""),
-
     Option("mds_max_xattr_pairs_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(64 << 10)
     .set_description(""),
@@ -6080,14 +6122,14 @@ std::vector<Option> get_mds_options() {
     .set_default(15)
     .set_description(""),
 
+    Option("mds_heartbeat_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(15)
+    .set_description("tolerance in seconds for MDS internal heartbeat"),
+
     Option("mds_enforce_unique_name", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
     .set_description(""),
 
-    Option("mds_session_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
-    .set_default(60)
-    .set_description(""),
-
     Option("mds_session_blacklist_on_timeout", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
     .set_default(true)
     .set_description(""),
@@ -6108,10 +6150,6 @@ std::vector<Option> get_mds_options() {
     .set_default(30)
     .set_description(""),
 
-    Option("mds_session_autoclose", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
-    .set_default(300)
-    .set_description(""),
-
     Option("mds_health_summarize_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(10)
     .set_description(""),
@@ -6184,10 +6222,6 @@ std::vector<Option> get_mds_options() {
     .set_default(0)
     .set_description(""),
 
-    Option("mds_bal_frag", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
-    .set_default(true)
-    .set_description(""),
-
     Option("mds_bal_split_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
     .set_default(10000)
     .set_description(""),
@@ -6509,6 +6543,16 @@ std::vector<Option> get_mds_options() {
     Option("mds_cap_revoke_eviction_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
      .set_default(0)
      .set_description("number of seconds after which clients which have not responded to cap revoke messages by the MDS are evicted."),
+
+    Option("mds_dump_cache_threshold_formatter", Option::TYPE_UINT, Option::LEVEL_DEV)
+     .set_default(1_G)
+     .set_description("threshold for cache usage to disallow \"dump cache\" operation to formatter")
+     .set_long_description("Disallow MDS from dumping caches to formatter via \"dump cache\" command if cache usage exceeds this threshold."),
+
+    Option("mds_dump_cache_threshold_file", Option::TYPE_UINT, Option::LEVEL_DEV)
+     .set_default(0)
+     .set_description("threshold for cache usage to disallow \"dump cache\" operation to file")
+     .set_long_description("Disallow MDS from dumping caches to file via \"dump cache\" command if cache usage exceeds this threshold."),
   });
 }
 
index 5d8415122495311faf5dfb84aee36fb474cafc9c..ae9152de660ec6c4bf9934952f907184bb2312b9 100644 (file)
@@ -1048,9 +1048,13 @@ void CrushCompiler::find_used_bucket_ids(iter_t const& i)
 {
   for (iter_t p = i->children.begin(); p != i->children.end(); p++) {
     if ((int)p->value.id().to_long() == crush_grammar::_bucket) {
-      iter_t firstline = p->children.begin() + 3;
-      string tag = string_node(firstline->children[0]);
-      if (tag == "id") {
+      for (iter_t firstline = p->children.begin() + 3;
+          firstline != p->children.end();
+          ++firstline) {
+       string tag = string_node(firstline->children[0]);
+       if (tag != "id") {
+         break;
+       }
        int id = int_node(firstline->children[1]);
        //err << "saw bucket id " << id << std::endl;
        id_item[id] = string();
index b6b4b2f20c3b17d784c8dbf7204106d85ccde405..86f91ef3d22587bbcd16c39d811de5989abcd2a3 100644 (file)
@@ -723,3 +723,80 @@ int CrushTester::test()
 
   return 0;
 }
+
+int CrushTester::compare(CrushWrapper& crush2)
+{
+  if (min_rule < 0 || max_rule < 0) {
+    min_rule = 0;
+    max_rule = crush.get_max_rules() - 1;
+  }
+  if (min_x < 0 || max_x < 0) {
+    min_x = 0;
+    max_x = 1023;
+  }
+
+  // initial osd weights
+  vector<__u32> weight;
+
+  /*
+   * note device weight is set by crushtool
+   * (likely due to a given command line option)
+   */
+  for (int o = 0; o < crush.get_max_devices(); o++) {
+    if (device_weight.count(o)) {
+      weight.push_back(device_weight[o]);
+    } else if (crush.check_item_present(o)) {
+      weight.push_back(0x10000);
+    } else {
+      weight.push_back(0);
+    }
+  }
+
+  // make adjustments
+  adjust_weights(weight);
+
+  map<int,int> bad_by_rule;
+
+  int ret = 0;
+  for (int r = min_rule; r < crush.get_max_rules() && r <= max_rule; r++) {
+    if (!crush.rule_exists(r)) {
+      if (output_statistics)
+        err << "rule " << r << " dne" << std::endl;
+      continue;
+    }
+    if (ruleset >= 0 &&
+       crush.get_rule_mask_ruleset(r) != ruleset) {
+      continue;
+    }
+    int minr = min_rep, maxr = max_rep;
+    if (min_rep < 0 || max_rep < 0) {
+      minr = crush.get_rule_mask_min_size(r);
+      maxr = crush.get_rule_mask_max_size(r);
+    }
+    int bad = 0;
+    for (int nr = minr; nr <= maxr; nr++) {
+      for (int x = min_x; x <= max_x; ++x) {
+       vector<int> out;
+       crush.do_rule(r, x, out, nr, weight, 0);
+       vector<int> out2;
+       crush2.do_rule(r, x, out2, nr, weight, 0);
+       if (out != out2) {
+         ++bad;
+       }
+      }
+    }
+    if (bad) {
+      ret = -1;
+    }
+    int max = (maxr - minr + 1) * (max_x - min_x + 1);
+    double ratio = (double)bad / (double)max;
+    cout << "rule " << r << " had " << bad << "/" << max
+        << " mismatched mappings (" << ratio << ")" << std::endl;
+  }
+  if (ret) {
+    cerr << "warning: maps are NOT equivalent" << std::endl;
+  } else {
+    cout << "maps appear equivalent" << std::endl;
+  }
+  return ret;
+}
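
For a feel of what the new compare() reports, here is a standalone sketch (toy placement functions standing in for do_rule() on two maps; not CRUSH itself) that runs two mappings over the same inputs and prints a mismatch ratio in the same style.

#include <iostream>
#include <vector>

// two toy "placement" functions standing in for do_rule() on two crush maps
static std::vector<int> place_a(int x, int nrep) {
  std::vector<int> out;
  for (int i = 0; i < nrep; ++i) out.push_back((x + i) % 10);
  return out;
}
static std::vector<int> place_b(int x, int nrep) {
  std::vector<int> out;
  for (int i = 0; i < nrep; ++i) out.push_back((x + i + (x % 7 == 0 ? 1 : 0)) % 10);
  return out;
}

int main()
{
  const int min_x = 0, max_x = 1023, nrep = 3;
  int bad = 0;
  for (int x = min_x; x <= max_x; ++x) {
    if (place_a(x, nrep) != place_b(x, nrep))   // compare full mappings, like out != out2
      ++bad;
  }
  const int total = max_x - min_x + 1;
  double ratio = static_cast<double>(bad) / total;
  std::cout << bad << "/" << total << " mismatched mappings ("
            << ratio << ")" << std::endl;
  return 0;
}
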
index e58c24875749ccb24542dc6df709700de616ccb8..c4257b63d6bc1ac7f4eac19856b37664d6a54924 100644 (file)
@@ -359,6 +359,8 @@ public:
   void check_overlapped_rules() const;
   int test();
   int test_with_fork(int timeout);
+
+  int compare(CrushWrapper& other);
 };
 
 #endif
index ed55b05624b8fd31ff4a361b1fa8cecee224b808..ca8d9059fd99d976b6c5d6d00f53897246f38e2c 100644 (file)
@@ -1607,18 +1607,432 @@ int32_t CrushWrapper::_alloc_class_id() const {
   assert(0 == "no available class id");
 }
 
-void CrushWrapper::reweight(CephContext *cct)
+int CrushWrapper::set_subtree_class(
+  const string& subtree,
+  const string& new_class)
 {
+  if (!name_exists(subtree)) {
+    return -ENOENT;
+  }
+
+  int new_class_id = -1;
+  if (class_exists(new_class)) {
+    new_class_id = get_class_id(new_class);
+  } else {
+    for (new_class_id = 1; class_name.count(new_class_id); ++new_class_id) ;
+    class_name[new_class_id] = new_class;
+    class_rname[new_class] = new_class_id;
+  }
+
+  int id = get_item_id(subtree);
+  list<int> q = { id };
+  while (!q.empty()) {
+    int id = q.front();
+    q.pop_front();
+    crush_bucket *b = get_bucket(id);
+    if (IS_ERR(b)) {
+      return PTR_ERR(b);
+    }
+    for (unsigned i = 0; i < b->size; ++i) {
+      int item = b->items[i];
+      if (item >= 0) {
+       class_map[item] = new_class_id;
+      } else {
+       q.push_back(item);
+      }
+    }
+  }
+  return 0;
+}
+
+int CrushWrapper::reclassify(
+  CephContext *cct,
+  ostream& out,
+  const map<string,string>& classify_root,
+  const map<string,pair<string,string>>& classify_bucket
+  )
+{
+  map<int,string> reclassified_bucket; // orig_id -> class
+
+  // classify_root
+  for (auto& i : classify_root) {
+    string root = i.first;
+    if (!name_exists(root)) {
+      out << "root " << root << " does not exist" << std::endl;
+      return -EINVAL;
+    }
+    int root_id = get_item_id(root);
+    string new_class = i.second;
+    int new_class_id = -1;
+    out << "classify_root " << root << " (" << root_id
+       << ") as " << new_class << std::endl;
+    if (class_exists(new_class)) {
+      new_class_id = get_class_id(new_class);
+      out << "  new class " << new_class << " exists as " << new_class_id
+         << std::endl;
+    } else {
+      for (new_class_id = 1; class_name.count(new_class_id); ++new_class_id) ;
+      class_name[new_class_id] = new_class;
+      class_rname[new_class] = new_class_id;
+      out << "  created new class " << new_class << " as " << new_class_id
+         << std::endl;
+    }
+
+    // validate rules
+    for (unsigned j = 0; j < crush->max_rules; j++) {
+      if (crush->rules[j]) {
+       auto rule = crush->rules[j];
+       for (unsigned k = 0; k < rule->len; ++k) {
+         if (rule->steps[k].op == CRUSH_RULE_TAKE) {
+           int step_item = get_rule_arg1(j, k);
+           int original_item;
+           int c;
+           int res = split_id_class(step_item, &original_item, &c);
+           if (res < 0)
+             return res;
+           if (c >= 0) {
+             if (original_item == root_id) {
+               out << "  rule " << j << " includes take on root "
+                   << root << " class " << c << std::endl;
+               return -EINVAL;
+             }
+           }
+         }
+       }
+      }
+    }
+
+    // rebuild new buckets for root
+    //cout << "before class_bucket: " << class_bucket << std::endl;
+    map<int,int> renumber;
+    list<int> q;
+    q.push_back(root_id);
+    while (!q.empty()) {
+      int id = q.front();
+      q.pop_front();
+      crush_bucket *bucket = get_bucket(id);
+      if (IS_ERR(bucket)) {
+       out << "cannot find bucket " << id
+           << ": " << cpp_strerror(PTR_ERR(bucket)) << std::endl;
+       return PTR_ERR(bucket);
+      }
+
+      // move bucket
+      int new_id = get_new_bucket_id();
+      out << "  renumbering bucket " << id << " -> " << new_id << std::endl;
+      renumber[id] = new_id;
+      crush->buckets[-1-new_id] = bucket;
+      bucket->id = new_id;
+      crush->buckets[-1-id] = crush_make_bucket(crush,
+                                               bucket->alg,
+                                               bucket->hash,
+                                               bucket->type,
+                                               0, NULL, NULL);
+      crush->buckets[-1-id]->id = id;
+      for (auto& i : choose_args) {
+       i.second.args[-1-new_id] = i.second.args[-1-id];
+       memset(&i.second.args[-1-id], 0, sizeof(i.second.args[0]));
+      }
+      class_bucket.erase(id);
+      class_bucket[new_id][new_class_id] = id;
+      name_map[new_id] = string(get_item_name(id));
+      name_map[id] = string(get_item_name(id)) + "~" + new_class;
+
+      for (unsigned j = 0; j < bucket->size; ++j) {
+       if (bucket->items[j] < 0) {
+         q.push_front(bucket->items[j]);
+       } else {
+         // we don't reclassify the device here; if the user wants that,
+         // they can pass --set-subtree-class separately.
+       }
+      }
+    }
+    //cout << "mid class_bucket: " << class_bucket << std::endl;
+
+    for (int i = 0; i < crush->max_buckets; ++i) {
+      crush_bucket *b = crush->buckets[i];
+      if (!b) {
+       continue;
+      }
+      for (unsigned j = 0; j < b->size; ++j) {
+       if (renumber.count(b->items[j])) {
+         b->items[j] = renumber[b->items[j]];
+       }
+      }
+    }
+
+    int r = rebuild_roots_with_classes();
+    if (r < 0) {
+      out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
+         << std::endl;
+      return r;
+    }
+    //cout << "final class_bucket: " << class_bucket << std::endl;
+  }
+
+  // classify_bucket
+  map<int,int> send_to;  // source bucket -> dest bucket
+  map<int,map<int,int>> new_class_bucket;
+  map<int,string> new_bucket_names;
+  map<int,map<string,string>> new_buckets;
+  map<string,int> new_bucket_by_name;
+  for (auto& i : classify_bucket) {
+    const string& match = i.first;  // prefix% or %suffix
+    const string& new_class = i.second.first;
+    const string& default_parent = i.second.second;
+    if (!name_exists(default_parent)) {
+      out << "default parent " << default_parent << " does not exist"
+         << std::endl;
+      return -EINVAL;
+    }
+    int default_parent_id = get_item_id(default_parent);
+    crush_bucket *default_parent_bucket = get_bucket(default_parent_id);
+    assert(default_parent_bucket);
+    string default_parent_type_name = get_type_name(default_parent_bucket->type);
+
+    out << "classify_bucket " << match << " as " << new_class
+       << " default bucket " << default_parent
+       << " (" << default_parent_type_name << ")" << std::endl;
+
+    int new_class_id = -1;
+    if (class_exists(new_class)) {
+      new_class_id = get_class_id(new_class);
+      out << "  new class " << new_class << " exists as " << new_class_id
+         << std::endl;
+    } else {
+      for (new_class_id = 1; class_name.count(new_class_id); ++new_class_id) {
+      }
+      class_name[new_class_id] = new_class;
+      class_rname[new_class] = new_class_id;
+      out << "  created new class " << new_class << " as " << new_class_id
+         << std::endl;
+    }
+
+    for (int j = 0; j < crush->max_buckets; ++j) {
+      crush_bucket *b = crush->buckets[j];
+      if (!b || is_shadow_item(b->id)) {
+       continue;
+      }
+      string name = get_item_name(b->id);
+      if (name.length() < match.length()) {
+       continue;
+      }
+      string basename;
+      if (match[0] == '%') {
+       if (match.substr(1) != name.substr(name.size() - match.size() + 1)) {
+         continue;
+       }
+       basename = name.substr(0, name.size() - match.size() + 1);
+      } else if (match[match.size() - 1] == '%') {
+       if (match.substr(0, match.size() - 1) !=
+           name.substr(0, match.size() - 1)) {
+         continue;
+       }
+       basename = name.substr(match.size() - 1);
+      } else if (match == name) {
+       basename = default_parent;
+      } else {
+       continue;
+      }
+      cout << "match " << match << " to " << name << " basename " << basename
+          << std::endl;
+      // look up or create basename bucket
+      int base_id;
+      if (name_exists(basename)) {
+       base_id = get_item_id(basename);
+       cout << "  have base " << base_id << std::endl;
+      } else if (new_bucket_by_name.count(basename)) {
+       base_id = new_bucket_by_name[basename];
+       cout << "  already creating base " << base_id << std::endl;
+      } else {
+       base_id = get_new_bucket_id();
+       crush->buckets[-1-base_id] = crush_make_bucket(crush,
+                                                      b->alg,
+                                                      b->hash,
+                                                      b->type,
+                                                      0, NULL, NULL);
+       crush->buckets[-1-base_id]->id = base_id;
+       name_map[base_id] = basename;
+       new_bucket_by_name[basename] = base_id;
+       cout << "  created base " << base_id << std::endl;
+
+       new_buckets[base_id][default_parent_type_name] = default_parent;
+      }
+      send_to[b->id] = base_id;
+      new_class_bucket[base_id][new_class_id] = b->id;
+      new_bucket_names[b->id] = basename + "~" + get_class_name(new_class_id);
+
+      // make sure devices are classified
+      for (unsigned i = 0; i < b->size; ++i) {
+       int item = b->items[i];
+       if (item >= 0) {
+         class_map[item] = new_class_id;
+       }
+      }
+    }
+  }
+
+  // invalidate rmaps so that name_exists() lookups below see the updated names
+  have_rmaps = false;
+
+  // copy items around
+  //cout << "send_to " << send_to << std::endl;
   set<int> roots;
   find_roots(&roots);
-  for (set<int>::iterator p = roots.begin(); p != roots.end(); ++p) {
-    if (*p >= 0)
+  for (auto& i : send_to) {
+    crush_bucket *from = get_bucket(i.first);
+    crush_bucket *to = get_bucket(i.second);
+    cout << "moving items from " << from->id << " (" << get_item_name(from->id)
+        << ") to " << to->id << " (" << get_item_name(to->id) << ")"
+        << std::endl;
+    for (unsigned j = 0; j < from->size; ++j) {
+      int item = from->items[j];
+      int r;
+      map<string,string> to_loc;
+      to_loc[get_type_name(to->type)] = get_item_name(to->id);
+      if (item >= 0) {
+       if (subtree_contains(to->id, item)) {
+         continue;
+       }
+       map<string,string> from_loc;
+       from_loc[get_type_name(from->type)] = get_item_name(from->id);
+       auto w = get_item_weightf_in_loc(item, from_loc);
+       r = insert_item(cct, item,
+                       w,
+                       get_item_name(item),
+                       to_loc);
+      } else {
+       if (!send_to.count(item)) {
+         lderr(cct) << "item " << item << " in bucket " << from->id
+              << " is not also a reclassified bucket" << dendl;
+         return -EINVAL;
+       }
+       int newitem = send_to[item];
+       if (subtree_contains(to->id, newitem)) {
+         continue;
+       }
+       r = link_bucket(cct, newitem, to_loc);
+      }
+      if (r != 0) {
+       cout << __func__ << " err from insert_item: " << cpp_strerror(r)
+            << std::endl;
+       return r;
+      }
+    }
+  }
+
+  // make sure new buckets have parents
+  for (auto& i : new_buckets) {
+    int parent;
+    if (get_immediate_parent_id(i.first, &parent) < 0) {
+      cout << "new bucket " << i.first << " missing parent, adding at "
+          << i.second << std::endl;
+      int r = link_bucket(cct, i.first, i.second);
+      if (r != 0) {
+       cout << __func__ << " err from insert_item: " << cpp_strerror(r)
+            << std::endl;
+       return r;
+      }
+    }
+  }
+
+  // set class mappings
+  //cout << "pre class_bucket: " << class_bucket << std::endl;
+  for (auto& i : new_class_bucket) {
+    for (auto& j : i.second) {
+      class_bucket[i.first][j.first] = j.second;
+    }
+
+  }
+  //cout << "post class_bucket: " << class_bucket << std::endl;
+  for (auto& i : new_bucket_names) {
+    name_map[i.first] = i.second;
+  }
+
+  int r = rebuild_roots_with_classes();
+  if (r < 0) {
+    out << "failed to rebuild_roots_with_classes: " << cpp_strerror(r)
+       << std::endl;
+    return r;
+  }
+  //cout << "final class_bucket: " << class_bucket << std::endl;
+
+  return 0;
+}
+
+int CrushWrapper::get_new_bucket_id()
+{
+  int id = -1;
+  while (crush->buckets[-1-id] &&
+        -1-id < crush->max_buckets) {
+    id--;
+  }
+  if (-1-id == crush->max_buckets) {
+    ++crush->max_buckets;
+    crush->buckets = (struct crush_bucket**)realloc(
+      crush->buckets,
+      sizeof(crush->buckets[0]) * crush->max_buckets);
+    for (auto& i : choose_args) {
+      assert(i.second.size == crush->max_buckets - 1);
+      ++i.second.size;
+      i.second.args = (struct crush_choose_arg*)realloc(
+       i.second.args,
+       sizeof(i.second.args[0]) * i.second.size);
+    }
+  }
+  return id;
+}
+
+void CrushWrapper::reweight(CephContext *cct)
+{
+  set<int> roots;
+  find_nonshadow_roots(&roots);
+  for (auto id : roots) {
+    if (id >= 0)
       continue;
-    crush_bucket *b = get_bucket(*p);
-    ldout(cct, 5) << "reweight bucket " << *p << dendl;
+    crush_bucket *b = get_bucket(id);
+    ldout(cct, 5) << "reweight root bucket " << id << dendl;
     int r = crush_reweight_bucket(crush, b);
     assert(r == 0);
+
+    for (auto& i : choose_args) {
+      //cout << "carg " << i.first << std::endl;
+      vector<uint32_t> w;  // discard top-level weights
+      reweight_bucket(b, i.second, &w);
+    }
+  }
+  int r = rebuild_roots_with_classes();
+  ceph_assert(r == 0);
+}
+
+void CrushWrapper::reweight_bucket(
+  crush_bucket *b,
+  crush_choose_arg_map& arg_map,
+  vector<uint32_t> *weightv)
+{
+  int idx = -1 - b->id;
+  unsigned npos = arg_map.args[idx].weight_set_positions;
+  //cout << __func__ << " " << b->id << " npos " << npos << std::endl;
+  weightv->resize(npos);
+  for (unsigned i = 0; i < b->size; ++i) {
+    int item = b->items[i];
+    if (item >= 0) {
+      for (unsigned pos = 0; pos < npos; ++pos) {
+       (*weightv)[pos] += arg_map.args[idx].weight_set->weights[i];
+      }
+    } else {
+      vector<uint32_t> subw(npos);
+      crush_bucket *sub = get_bucket(item);
+      assert(sub);
+      reweight_bucket(sub, arg_map, &subw);
+      for (unsigned pos = 0; pos < npos; ++pos) {
+       (*weightv)[pos] += subw[pos];
+       // overwrite the stored bucket weight with the computed weights for this reference
+       arg_map.args[idx].weight_set->weights[i] = subw[pos];
+      }
+    }
   }
+  //cout << __func__ << " finish " << b->id << " " << *weightv << std::endl;
 }
 
 int CrushWrapper::add_simple_rule_at(
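
For reference, a standalone sketch (not part of the commit) of the bucket-name matching that reclassify() performs earlier in this file: a pattern of the form prefix% or %suffix strips the matched part to obtain the basename of the new parent bucket, and an exact match falls back to the default parent.

#include <iostream>
#include <string>

// Returns true and fills basename if 'name' matches 'match', mirroring the
// prefix% / %suffix / exact-match cases handled in CrushWrapper::reclassify().
static bool classify_match(const std::string& match, const std::string& name,
                           const std::string& default_parent, std::string* basename)
{
  if (name.length() < match.length())
    return false;
  if (match[0] == '%') {                     // %suffix
    if (match.substr(1) != name.substr(name.size() - match.size() + 1))
      return false;
    *basename = name.substr(0, name.size() - match.size() + 1);
    return true;
  }
  if (match[match.size() - 1] == '%') {      // prefix%
    if (match.substr(0, match.size() - 1) != name.substr(0, match.size() - 1))
      return false;
    *basename = name.substr(match.size() - 1);
    return true;
  }
  if (match == name) {                       // exact: use the default parent
    *basename = default_parent;
    return true;
  }
  return false;
}

int main()
{
  std::string base;
  if (classify_match("%-ssd", "host1-ssd", "default", &base))
    std::cout << "host1-ssd -> " << base << "\n";   // host1
  if (classify_match("ssd-%", "ssd-host2", "default", &base))
    std::cout << "ssd-host2 -> " << base << "\n";   // host2
  if (classify_match("ssdroot", "ssdroot", "default", &base))
    std::cout << "ssdroot -> " << base << "\n";     // default
  return 0;
}
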
index b18e4f8e6061a1e8b9168e973d2d1d3dc65d8a2e..8063e8ab82a365932a5ca6b8a0510a4dc876d7e1 100644 (file)
@@ -948,6 +948,9 @@ public:
     return adjust_item_weight_in_loc(cct, id, (int)(weight * (float)0x10000), loc);
   }
   void reweight(CephContext *cct);
+  void reweight_bucket(crush_bucket *b,
+                      crush_choose_arg_map& arg_map,
+                      vector<uint32_t> *weightv);
 
   int adjust_subtree_weight(CephContext *cct, int id, int weight);
   int adjust_subtree_weightf(CephContext *cct, int id, float weight) {
@@ -1186,6 +1189,8 @@ private:
    **/
   int detach_bucket(CephContext *cct, int item);
 
+  int get_new_bucket_id();
+
 public:
   int get_max_buckets() const {
     if (!crush) return -EINVAL;
@@ -1286,6 +1291,15 @@ public:
   /* remove unused roots generated for class devices */
   int trim_roots_with_class();
 
+  int reclassify(
+    CephContext *cct,
+    ostream& out,
+    const map<string,string>& classify_root,
+    const map<string,pair<string,string>>& classify_bucket
+    );
+
+  int set_subtree_class(const string& name, const string& class_name);
+
   void start_choose_profile() {
     free(crush->choose_tries);
     /*
index 596f15b9951a931b2081125e5087b7bfc28e65a6..4892be61294ccc2458d6c211c2bae15d6ec8b825 100644 (file)
@@ -692,6 +692,7 @@ namespace buffer CEPH_BUFFER_API {
       return *this;
     }
 
+    uint64_t get_wasted_space() const;
     unsigned get_num_buffers() const { return _buffers.size(); }
     const ptr& front() const { return _buffers.front(); }
     const ptr& back() const { return _buffers.back(); }
index bea6f49fb160760e13d12644f6652573809ce353..ac51eb246d5be64f5e6145f54cf416593b50f823 100755 (executable)
@@ -93,6 +93,7 @@ DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
 
 DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)
 DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE(19, 2, OSD_PGLOG_HARDLIMIT)
 
 DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
 
@@ -231,6 +232,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
         CEPH_FEATURE_RADOS_BACKOFF |           \
         CEPH_FEATURE_OSD_RECOVERY_DELETES | \
         CEPH_FEATURE_CEPHX_V2 | \
+        CEPH_FEATURE_OSD_PGLOG_HARDLIMIT | \
         0ULL)
 
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
index b40683db60de4fc9c69f69cf596291e7d2bcb244..f57f780a6e0dbdf5d48c271a21db78693bd4c5f0 100644 (file)
@@ -379,10 +379,10 @@ extern const char *ceph_mds_op_name(int op);
 #define CEPH_SETATTR_ATIME     (1 << 4)
 #define CEPH_SETATTR_SIZE      (1 << 5)
 #define CEPH_SETATTR_CTIME     (1 << 6)
-#define CEPH_SETATTR_BTIME     (1 << 9)
-#endif
 #define CEPH_SETATTR_MTIME_NOW (1 << 7)
 #define CEPH_SETATTR_ATIME_NOW (1 << 8)
+#define CEPH_SETATTR_BTIME     (1 << 9)
+#endif
 #define CEPH_SETATTR_KILL_SGUID        (1 << 10)
 
 /*
index b6c0f706a419dd8209e9c4bcbd0e9d02f8288c14..cb90130fbce6f2d6750c4187e53908a652b7672e 100644 (file)
@@ -116,6 +116,8 @@ struct CephContext;
 # define CEPH_SETATTR_ATIME    16
 # define CEPH_SETATTR_SIZE     32
 # define CEPH_SETATTR_CTIME    64
+# define CEPH_SETATTR_MTIME_NOW        128
+# define CEPH_SETATTR_ATIME_NOW        256
 # define CEPH_SETATTR_BTIME    512
 #endif
 
index 19f1d0a5fe40ca3ec8220e14af99443cfbc31da4..5627d6392abd11287ca0ad8246576b24be159196 100644 (file)
 /* Defined if boost::context is available */
 #cmakedefine HAVE_BOOST_CONTEXT
 
+/* Defined if OpenSSL is available for the rgw beast frontend */
+#cmakedefine WITH_RADOSGW_BEAST_OPENSSL
+
 #endif /* CONFIG_H */
index e91236a011c47dab77d52767c7ac9cfb2f4f181b..f8fcc6b41fe3a83a1305e041c95f15fa9e339183 100644 (file)
@@ -157,6 +157,7 @@ extern const char *ceph_osd_state_name(int s);
 #define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
 #define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
 #define CEPH_OSDMAP_PURGED_SNAPDIRS  (1<<20) /* osds have converted snapsets */
+#define CEPH_OSDMAP_PGLOG_HARDLIMIT  (1<<22) /* put a hard limit on pg log length */
 
 /* these are hidden in 'ceph status' view */
 #define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL|       \
@@ -164,7 +165,8 @@ extern const char *ceph_osd_state_name(int s);
                                      CEPH_OSDMAP_REQUIRE_LUMINOUS |    \
                                      CEPH_OSDMAP_RECOVERY_DELETES |    \
                                      CEPH_OSDMAP_SORTBITWISE |         \
-                                     CEPH_OSDMAP_PURGED_SNAPDIRS)
+                                     CEPH_OSDMAP_PURGED_SNAPDIRS |     \
+                                      CEPH_OSDMAP_PGLOG_HARDLIMIT)
 #define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL |  \
                                          CEPH_OSDMAP_REQUIRE_KRAKEN |  \
                                          CEPH_OSDMAP_REQUIRE_LUMINOUS)
index f4cd777df036e15a3b34266c034ad6704516770b..ad8276d2db77d3a44d066d0d4ba6bca40b4d9c0f 100644 (file)
@@ -110,18 +110,19 @@ namespace librados
     bool operator!=(const NObjectIterator& rhs) const;
     const ListObject& operator*() const;
     const ListObject* operator->() const;
-    NObjectIterator &operator++(); // Preincrement
-    NObjectIterator operator++(int); // Postincrement
+    NObjectIterator &operator++(); //< Preincrement; errors are thrown as exceptions
+    NObjectIterator operator++(int); //< Postincrement; errors are thrown as exceptions
     friend class IoCtx;
     friend class NObjectIteratorImpl;
 
     /// get current hash position of the iterator, rounded to the current pg
     uint32_t get_pg_hash_position() const;
 
-    /// move the iterator to a given hash position.  this may (will!) be rounded to the nearest pg.
+    /// move the iterator to a given hash position. this may (will!) be rounded
+    /// to the nearest pg. errors are thrown as exceptions
     uint32_t seek(uint32_t pos);
 
-    /// move the iterator to a given cursor position
+    /// move the iterator to a given cursor position. errors are thrown as exceptions
     uint32_t seek(const ObjectCursor& cursor);
 
     /// get current cursor position
@@ -890,14 +891,16 @@ namespace librados
                     std::list<librados::locker_t> *lockers);
 
 
-    /// Start enumerating objects for a pool
+    /// Start enumerating objects for a pool. Errors are thrown as exceptions.
     NObjectIterator nobjects_begin();
     NObjectIterator nobjects_begin(const bufferlist &filter);
-    /// Start enumerating objects for a pool starting from a hash position
+    /// Start enumerating objects for a pool starting from a hash position.
+    /// Errors are thrown as exceptions.
     NObjectIterator nobjects_begin(uint32_t start_hash_position);
     NObjectIterator nobjects_begin(uint32_t start_hash_position,
                                    const bufferlist &filter);
-    /// Start enumerating objects for a pool starting from cursor
+    /// Start enumerating objects for a pool starting from cursor. Errors are
+    /// thrown as exceptions.
     NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor);
     NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor,
                                    const bufferlist &filter);
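
Given the doc-comment changes above (listing errors surface as exceptions rather than return codes), a hedged usage sketch — assuming a reachable cluster, the default ceph.conf search path, and a pool named "rbd" — of guarding the enumeration accordingly:

#include <rados/librados.hpp>
#include <exception>
#include <iostream>

int main()
{
  librados::Rados cluster;
  cluster.init("admin");                       // client.admin
  cluster.conf_read_file(nullptr);             // default ceph.conf search path
  if (cluster.connect() < 0) {
    std::cerr << "cannot connect to cluster" << std::endl;
    return 1;
  }
  librados::IoCtx io_ctx;
  if (cluster.ioctx_create("rbd", io_ctx) < 0) {   // pool name is an assumption
    std::cerr << "cannot open pool" << std::endl;
    return 1;
  }
  try {
    // nobjects_begin()/operator++ may throw (e.g. std::system_error) on
    // listing errors rather than returning an error code
    for (auto it = io_ctx.nobjects_begin(); it != io_ctx.nobjects_end(); ++it) {
      std::cout << it->get_oid() << std::endl;
    }
  } catch (const std::exception& e) {
    std::cerr << "object listing failed: " << e.what() << std::endl;
    return 1;
  }
  return 0;
}
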
index b96c726fffbd2a6afc9b9facb1f4e671f2b89459..79d55c796dfeb0a42c10d30bd042ee93ee5e1066 100644 (file)
@@ -56,7 +56,6 @@ using std::map;
 using std::set;
 using std::vector;
 using std::list;
-using std::runtime_error;
 
 #define dout_subsys ceph_subsys_rados
 #undef dout_prefix
@@ -809,9 +808,8 @@ void librados::NObjectIteratorImpl::get_next()
     return;
   }
   else if (ret) {
-    ostringstream oss;
-    oss << "rados returned " << cpp_strerror(ret);
-    throw std::runtime_error(oss.str());
+    throw std::system_error(-ret, std::system_category(),
+                            "rados_nobjects_list_next");
   }
 
   if (cur_obj.impl == NULL)
index db07e5e2275bb9138de81ac95e71a09aeb722c46..1d4664eb2a6fdf48eb1aac543844d6bce7b3ad5b 100644 (file)
@@ -3058,6 +3058,7 @@ extern "C" int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
     tracepoint(librbd, snap_list_exit, -EINVAL, 0);
     return -EINVAL;
   }
+  memset(snaps, 0, sizeof(*snaps) * *max_snaps);
 
   int r = librbd::snap_list(ictx, cpp_snaps);
   if (r == -ENOENT) {
index 07e5158c97f243aee74bbc90468666182142514f..1e7e69afe8759c4d8d311db4ca6a1a07b43e0404 100644 (file)
@@ -118,6 +118,7 @@ Context *ResizeRequest<I>::send_append_op_event() {
 
   if (m_new_size < m_original_size && !m_allow_shrink) {
     ldout(cct, 1) << " shrinking the image is not permitted" << dendl;
+    image_ctx.io_work_queue->unblock_writes();
     this->async_complete(-EINVAL);
     return nullptr;
   }
index 8ba75de12c816b683c3d699c50b484f7b3c172e7..e8c1bc8bc1e3ece37bf17a1cb156ec83e76a2ddd 100644 (file)
@@ -420,7 +420,10 @@ void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
   int64_t old_pool = inode.layout.pool_id;
 
   mark_dirty(front.inode.version, ls);
+  bool new_export_pin = inode.export_pin != front.inode.export_pin;
   inode = front.inode;
+  if (new_export_pin)
+    maybe_export_pin(true);
 
   if (inode.is_backtrace_updated())
     mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
@@ -4514,7 +4517,6 @@ void CInode::set_export_pin(mds_rank_t rank)
   assert(is_dir());
   assert(is_projected());
   get_projected_inode()->export_pin = rank;
-  maybe_export_pin(true);
 }
 
 mds_rank_t CInode::get_export_pin(bool inherit) const
@@ -4528,15 +4530,14 @@ mds_rank_t CInode::get_export_pin(bool inherit) const
   while (true) {
     if (in->is_system())
       break;
-    const CDentry *pdn = in->get_projected_parent_dn();
+    const CDentry *pdn = in->get_parent_dn();
     if (!pdn)
       break;
-    const mempool_inode *pi = in->get_projected_inode();
     // ignore export pin for unlinked directory
-    if (pi->nlink == 0)
+    if (in->get_inode().nlink == 0)
       break;
-    if (pi->export_pin >= 0)
-      return pi->export_pin;
+    if (in->get_inode().export_pin >= 0)
+      return in->get_inode().export_pin;
 
     if (!inherit)
       break;
index 6e2b73b076973c806b106a9c957685d2b3b63d78..c7628fee645548d5bb7dc5c382efebd5a43e1b44 100644 (file)
@@ -736,6 +736,7 @@ public:
   int d_type() const { return IFTODT(inode.mode); }
 
   mempool_inode& get_inode() { return inode; }
+  const mempool_inode& get_inode() const { return inode; }
   CDentry* get_parent_dn() { return parent; }
   const CDentry* get_parent_dn() const { return parent; }
   const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; }
index a4f33db8d3b0101cadcfc232d10cbdf01aed7c8f..cfa1352255e546d1cfafcb76564917a3c2e3c563 100644 (file)
@@ -232,16 +232,12 @@ void FSMap::create_filesystem(boost::string_view name,
   auto fs = std::make_shared<Filesystem>();
   fs->mds_map.epoch = epoch;
   fs->mds_map.fs_name = std::string(name);
-  fs->mds_map.max_mds = 1;
   fs->mds_map.data_pools.push_back(data_pool);
   fs->mds_map.metadata_pool = metadata_pool;
   fs->mds_map.cas_pool = -1;
-  fs->mds_map.max_file_size = g_conf->mds_max_file_size;
   fs->mds_map.compat = compat;
   fs->mds_map.created = ceph_clock_now();
   fs->mds_map.modified = ceph_clock_now();
-  fs->mds_map.session_timeout = g_conf->mds_session_timeout;
-  fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
   fs->mds_map.enabled = true;
   if (features & CEPH_FEATURE_SERVER_JEWEL) {
     fs->fscid = next_filesystem_id++;
@@ -276,17 +272,13 @@ void FSMap::reset_filesystem(fs_cluster_id_t fscid)
   // Carry forward what makes sense
   new_fs->fscid = fs->fscid;
   new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
-  new_fs->mds_map.max_mds = 1;
   new_fs->mds_map.data_pools = fs->mds_map.data_pools;
   new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
   new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
   new_fs->mds_map.fs_name = fs->mds_map.fs_name;
-  new_fs->mds_map.max_file_size = g_conf->mds_max_file_size;
   new_fs->mds_map.compat = compat;
   new_fs->mds_map.created = ceph_clock_now();
   new_fs->mds_map.modified = ceph_clock_now();
-  new_fs->mds_map.session_timeout = g_conf->mds_session_timeout;
-  new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
   new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
   new_fs->mds_map.enabled = true;
 
index 058fcae7642c5aa9fe89750b007b541fc3001126..42b47087f24b3e923f5b195b369b0b18bf6b5363 100644 (file)
@@ -2224,7 +2224,7 @@ void Locker::request_inode_file_caps(CInode *in)
 {
   assert(!in->is_auth());
 
-  int wanted = in->get_caps_wanted() & ~CEPH_CAP_PIN;
+  int wanted = in->get_caps_wanted() & in->get_caps_allowed_ever() & ~CEPH_CAP_PIN;
   if (wanted != in->replica_caps_wanted) {
     // wait for single auth
     if (in->is_ambiguous_auth()) {
@@ -2873,9 +2873,9 @@ void Locker::handle_client_caps(MClientCaps *m)
 
     // filter wanted based on what we could ever give out (given auth/replica status)
     bool need_flush = m->flags & CLIENT_CAPS_SYNC;
-    int new_wanted = m->get_wanted() & head_in->get_caps_allowed_ever();
+    int new_wanted = m->get_wanted();
     if (new_wanted != cap->wanted()) {
-      if (!need_flush && (new_wanted & ~cap->pending())) {
+      if (!need_flush && in->is_auth() && (new_wanted & ~cap->pending())) {
        // expanding caps.  make sure we aren't waiting for a log flush
        need_flush = _need_flush_mdlog(head_in, new_wanted & ~cap->pending());
       }
@@ -3534,7 +3534,7 @@ void Locker::remove_client_cap(CInode *in, client_t client)
 
 /**
  * Return true if any currently revoking caps exceed the
- * mds_session_timeout threshold.
+ * session_timeout threshold.
  */
 bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking,
                                     double timeout) const
@@ -3587,8 +3587,8 @@ void Locker::caps_tick()
 
     utime_t age = now - cap->get_last_revoke_stamp();
     dout(20) << __func__ << " age = " << age << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
-    if (age <= g_conf->mds_session_timeout) {
-      dout(20) << __func__ << " age below timeout " << g_conf->mds_session_timeout << dendl;
+    if (age <= mds->mdsmap->get_session_timeout()) {
+      dout(20) << __func__ << " age below timeout " << mds->mdsmap->get_session_timeout() << dendl;
       break;
     } else {
       ++i;
@@ -3599,7 +3599,7 @@ void Locker::caps_tick()
       }
     }
     // exponential backoff of warning intervals
-    if (age > g_conf->mds_session_timeout * (1 << cap->get_num_revoke_warnings())) {
+    if (age > mds->mdsmap->get_session_timeout() * (1 << cap->get_num_revoke_warnings())) {
       cap->inc_num_revoke_warnings();
       stringstream ss;
       ss << "client." << cap->get_client() << " isn't responding to mclientcaps(revoke), ino "
@@ -4871,7 +4871,7 @@ void Locker::file_eval(ScatterLock *lock, bool *need_issue)
   else if (lock->get_state() != LOCK_SYNC &&
           !lock->is_wrlocked() &&   // drain wrlocks first!
           !lock->is_waiter_for(SimpleLock::WAIT_WR) &&
-          !(wanted & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)) &&
+          !(wanted & CEPH_CAP_GWR) &&
           !((lock->get_state() == LOCK_MIX) &&
             in->is_dir() && in->has_subtree_or_exporting_dirfrag())  // if we are a delegation point, stay where we are
           //((wanted & CEPH_CAP_RD) || 
index c3d56bff9d3dc718211391a5c4836d7f5abcd773..39abb57ba60516a9cd03832e175f70eb9fdb5a8b 100644 (file)
@@ -1147,7 +1147,7 @@ void MDBalancer::hit_inode(const utime_t& now, CInode *in, int type, int who)
 void MDBalancer::maybe_fragment(CDir *dir, bool hot)
 {
   // split/merge
-  if (g_conf->mds_bal_frag && bal_fragment_interval > 0 &&
+  if (bal_fragment_interval > 0 &&
       dir->is_auth() &&
       !dir->inode->is_base() &&  // not root/base (for now at least)
       !dir->inode->is_stray()) { // not straydir
index ddb7ea924e0d39d25ae0c3e3705ac6b13c3b0aec..d7b40a3126f6522e7bfbf3a924ae1b114ddc69c4 100644 (file)
@@ -399,7 +399,8 @@ void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
   memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
   if (in->inode.is_dir()) {
     in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
-    ++in->inode.rstat.rsubdirs;
+    in->inode.rstat.rsubdirs = 1; /* itself */
+    in->inode.rstat.rctime = in->inode.ctime;
   } else {
     in->inode.layout = default_file_layout;
     ++in->inode.rstat.rfiles;
@@ -446,13 +447,12 @@ void MDCache::create_empty_hierarchy(MDSGather *gather)
   adjust_subtree_auth(rootdir, mds->get_nodeid());   
   rootdir->dir_rep = CDir::REP_ALL;   //NONE;
 
-  rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
-  rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
-
-  root->inode.dirstat = rootdir->fnode.fragstat;
-  root->inode.rstat = rootdir->fnode.rstat;
-  ++root->inode.rstat.rsubdirs;
-  root->inode.accounted_rstat = root->inode.rstat;
+  assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
+  assert(rootdir->fnode.fragstat == root->inode.dirstat);
+  assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
+  /* Do not update rootdir rstat information of the fragment; rstat upkeep magic
+   * assumes version 0 is stale/invalid.
+   */
 
   rootdir->mark_complete();
   rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
@@ -3068,7 +3068,7 @@ void MDCache::handle_mds_failure(mds_rank_t who)
 
   // MDCache::shutdown_export_strays() always exports strays to mds.0
   if (who == mds_rank_t(0))
-    shutdown_exported_strays.clear();
+    shutdown_exporting_strays.clear();
 
   show_subtrees();  
 }
@@ -7498,7 +7498,7 @@ void MDCache::check_memory_usage()
 
   if (cache_toofull()) {
     last_recall_state = clock::now();
-    mds->server->recall_client_state();
+    mds->server->recall_client_state(-1.0, false, nullptr);
   }
 
   // If the cache size had exceeded its limit, but we're back in bounds
@@ -7642,7 +7642,16 @@ bool MDCache::shutdown_pass()
   // Fully trim the log so that all objects in cache are clean and may be
   // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
   // trim the log such that the cache eventually becomes clean.
-  mds->mdlog->trim(0);
+  if (mds->mdlog->get_num_segments() > 0) {
+    auto ls = mds->mdlog->get_current_segment();
+    if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
+      // Current segment contains events other than subtreemap or
+      // there are dirty dirfrags (see CDir::log_mark_dirty())
+      mds->mdlog->start_new_segment();
+      mds->mdlog->flush();
+    }
+  }
+  mds->mdlog->trim_all();
   if (mds->mdlog->get_num_segments() > 1) {
     dout(7) << "still >1 segments, waiting for log to trim" << dendl;
     return false;
@@ -7675,6 +7684,12 @@ bool MDCache::shutdown_pass()
   assert(!migrator->is_exporting());
   assert(!migrator->is_importing());
 
+  // replicas may dirty scatter locks
+  if (myin && myin->is_replicated()) {
+    dout(7) << "still have replicated objects" << dendl;
+    return false;
+  }
+
   if ((myin && myin->is_auth_pinned()) ||
       (mydir && mydir->is_auth_pinned())) {
     dout(7) << "still have auth pinned objects" << dendl;
@@ -7685,9 +7700,11 @@ bool MDCache::shutdown_pass()
   if (!mds->mdlog->is_capped()) {
     dout(7) << "capping the log" << dendl;
     mds->mdlog->cap();
-    mds->mdlog->trim();
   }
   
+  if (!mds->mdlog->empty())
+    mds->mdlog->trim(0);
+
   if (!mds->mdlog->empty()) {
     dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() 
            << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
@@ -7744,60 +7761,119 @@ bool MDCache::shutdown_pass()
 
 bool MDCache::shutdown_export_strays()
 {
+  static const unsigned MAX_EXPORTING = 100;
+
   if (mds->get_nodeid() == 0)
     return true;
-  
-  dout(10) << "shutdown_export_strays" << dendl;
+
+  if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
+    return false;
+
+  dout(10) << "shutdown_export_strays " << shutdown_export_next.first
+          << " '" << shutdown_export_next.second << "'" << dendl;
 
   bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
+  bool all_exported = false;
 
-  bool done = true;
+again:
+  auto next = shutdown_export_next;
 
-  list<CDir*> dfs;
   for (int i = 0; i < NUM_STRAY; ++i) {
-    if (!strays[i] ||
-       !strays[i]->state_test(CInode::STATE_STRAYPINNED))
+    CInode *strayi = strays[i];
+    if (!strayi ||
+       !strayi->state_test(CInode::STATE_STRAYPINNED))
+      continue;
+    if (strayi->ino() < next.first.ino)
       continue;
-    strays[i]->get_dirfrags(dfs);
-  }
 
-  for (std::list<CDir*>::iterator dfs_i = dfs.begin();
-       dfs_i != dfs.end(); ++dfs_i)
-  {
-    CDir *dir = *dfs_i;
+    deque<CDir*> dfls;
+    strayi->get_dirfrags(dfls);
 
-    if (!dir->is_complete()) {
-      dir->fetch(0);
-      done = false;
-      if (!mds0_active)
-       break;
-    }
-    
-    for (auto &p : dir->items) {
-      CDentry *dn = p.second;
-      CDentry::linkage_t *dnl = dn->get_projected_linkage();
-      if (dnl->is_null())
+    while (!dfls.empty()) {
+      CDir *dir = dfls.front();
+      dfls.pop_front();
+
+      if (dir->dirfrag() < next.first)
        continue;
-      done = false;
-      if (!mds0_active)
-       break;
-      
-      if (dn->state_test(CDentry::STATE_PURGING)) {
-        // Don't try to migrate anything that is actually
-        // being purged right now
-        continue;
+      if (next.first < dir->dirfrag()) {
+       next.first = dir->dirfrag();
+       next.second.clear();
+      }
+
+      if (!dir->is_complete()) {
+       MDSInternalContextBase *fin = nullptr;
+       if (shutdown_exporting_strays.empty()) {
+         fin = new MDSInternalContextWrapper(mds,
+                 new FunctionContext([this](int r) {
+                   shutdown_export_strays();
+                 })
+               );
+       }
+       dir->fetch(fin);
+       goto done;
       }
 
-      if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
-       shutdown_exported_strays.insert(dnl->get_inode()->ino());
-       stray_manager.migrate_stray(dn, mds_rank_t(0));  // send to root!
+      CDir::dentry_key_map::iterator it;
+      if (next.second.empty()) {
+       it = dir->begin();
       } else {
-       dout(10) << "already exporting " << *dn << dendl;
+       auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
+       it = dir->lower_bound(dentry_key_t(0, next.second, hash));
+      }
+
+      for (; it != dir->end(); ++it) {
+       CDentry *dn = it->second;
+       CDentry::linkage_t *dnl = dn->get_projected_linkage();
+       if (dnl->is_null())
+         continue;
+
+       if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
+         next.second = string(it->first.name);
+         goto done;
+       }
+
+       auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
+       if (!ret.second) {
+         dout(10) << "already exporting/purging " << *dn << dendl;
+         continue;
+       }
+
+       // Don't try to migrate anything that is actually
+       // being purged right now
+       if (!dn->state_test(CDentry::STATE_PURGING))
+         stray_manager.migrate_stray(dn, mds_rank_t(0));  // send to root!
+
+       if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
+         ++it;
+         if (it != dir->end()) {
+           next.second = string(it->first.name);
+         } else {
+           if (dfls.empty())
+             next.first.ino.val++;
+           else
+             next.first = dfls.front()->dirfrag();
+           next.second.clear();
+         }
+         goto done;
+       }
       }
     }
   }
 
-  return done;
+  if (shutdown_exporting_strays.empty()) {
+    dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
+    if (first_df < shutdown_export_next.first ||
+       !shutdown_export_next.second.empty()) {
+      shutdown_export_next.first = first_df;
+      shutdown_export_next.second.clear();
+      goto again;
+    }
+    all_exported = true;
+  }
+
+done:
+  shutdown_export_next = next;
+  return all_exported;
 }
 
 // ========= messaging ==============
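The rewritten shutdown_export_strays() keeps at most MAX_EXPORTING stray exports in flight and records where the scan stopped in shutdown_export_next (a dirfrag plus dentry name), so the next call resumes from that cursor instead of rescanning from the start. A minimal sketch of that resumable, batched-scan pattern (standard C++ only; the map and names are placeholders, not the real MDS structures):

    #include <cstddef>
    #include <iostream>
    #include <iterator>
    #include <map>
    #include <string>

    // Illustrative only: visit at most `batch` entries per call, remembering
    // the next key in `cursor` so a later call can resume where we stopped.
    bool process_batch(const std::map<std::string, int>& items,
                       std::string& cursor, std::size_t batch) {
      std::size_t done = 0;
      for (auto it = items.lower_bound(cursor); it != items.end(); ++it) {
        // ... per-entry work would go here (e.g. queueing an export) ...
        if (++done >= batch) {
          auto next = std::next(it);
          cursor = (next == items.end()) ? std::string() : next->first;
          return next == items.end();     // true once everything was visited
        }
      }
      cursor.clear();
      return true;                        // reached the end in this call
    }

    int main() {
      std::map<std::string, int> items{{"a",1},{"b",2},{"c",3},{"d",4},{"e",5}};
      std::string cursor;
      while (!process_batch(items, cursor, 2))
        std::cout << "will resume at '" << cursor << "'\n";
    }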
@@ -11940,7 +12016,7 @@ void MDCache::show_cache()
     show_func(p.second);
 }
 
-int MDCache::cache_status(Formatter *f)
+void MDCache::cache_status(Formatter *f)
 {
   f->open_object_section("cache");
 
@@ -11949,7 +12025,6 @@ int MDCache::cache_status(Formatter *f)
   f->close_section();
 
   f->close_section();
-  return 0;
 }
 
 int MDCache::dump_cache(boost::string_view file_name)
@@ -11975,6 +12050,32 @@ int MDCache::dump_cache(boost::string_view fn, Formatter *f,
                         boost::string_view dump_root, int depth)
 {
   int r = 0;
+
+  // dumping large caches may cause mds to hang or worse get killed.
+  // so, disallow the dump if the cache size exceeds the configured
+  // threshold, which is 1G for formatter and unlimited for file (note
+  // that this can be jacked up by the admin... and is nothing but foot
+  // shooting, but the option itself is for devs and hence dangerous to
+  // tune). TODO: remove this when fixed.
+  uint64_t threshold = f ?
+    g_conf->get_val<uint64_t>("mds_dump_cache_threshold_formatter") :
+    g_conf->get_val<uint64_t>("mds_dump_cache_threshold_file");
+
+  if (threshold && cache_size() > threshold) {
+    if (f) {
+      std::stringstream ss;
+      ss << "cache usage exceeds dump threshold";
+      f->open_object_section("result");
+      f->dump_string("error", ss.str());
+      f->close_section();
+    } else {
+      derr << "cache usage exceeds dump threshold" << dendl;
+      r = -EINVAL;
+    }
+    return r;
+  }
+
+  r = 0;
   int fd = -1;
 
   if (f) {
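The new guard above refuses to dump when cache usage exceeds a configurable threshold, reporting the refusal through the formatter (for asok output) or as an error code (for file output). A reduced sketch of the same guard shape (plain C++; the constants below are placeholders, not the actual mds_dump_cache_threshold_* defaults):

    #include <cstdint>
    #include <iostream>

    // Illustrative only: refuse an expensive dump when usage exceeds a limit.
    // Returns 0 on success and a negative errno-style value on refusal.
    int guarded_dump(uint64_t cache_bytes, uint64_t threshold_bytes) {
      if (threshold_bytes && cache_bytes > threshold_bytes) {
        std::cerr << "cache usage exceeds dump threshold\n";
        return -22;                       // i.e. -EINVAL
      }
      std::cout << "dumping " << cache_bytes << " bytes of cache state\n";
      return 0;
    }

    int main() {
      guarded_dump(2ull << 30, 1ull << 30);    // refused: 2 GiB > 1 GiB
      guarded_dump(512ull << 20, 1ull << 30);  // allowed: 512 MiB <= 1 GiB
    }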
index c39a9a1a632300ad0038b2342e0305235658f232..49d8fc738c72d4ed952a9afd3043c184a8016f3d 100644 (file)
@@ -759,13 +759,18 @@ public:
 
   // shutdown
 private:
-  set<inodeno_t> shutdown_exported_strays;
+  set<inodeno_t> shutdown_exporting_strays;
+  pair<dirfrag_t, string> shutdown_export_next;
 public:
   void shutdown_start();
   void shutdown_check();
   bool shutdown_pass();
-  bool shutdown_export_strays();
   bool shutdown();                    // clear cache (i.e. at shutdown)
+  bool shutdown_export_strays();
+  void shutdown_export_stray_finish(inodeno_t ino) {
+    if (shutdown_exporting_strays.erase(ino))
+      shutdown_export_strays();
+  }
 
   bool did_shutdown_log_cap;
 
@@ -1176,7 +1181,7 @@ public:
   int dump_cache(Formatter *f);
   int dump_cache(boost::string_view dump_root, int depth, Formatter *f);
 
-  int cache_status(Formatter *f);
+  void cache_status(Formatter *f);
 
   void dump_resolve_status(Formatter *f) const;
   void dump_rejoin_status(Formatter *f) const;
index 49297f86c91c5d5e23defb0c1a4d065ca5cbd7ec..1083ab356638bfbe0f36c0be046fb5bd2f81baf8 100644 (file)
@@ -64,7 +64,8 @@ void MDLog::create_logger()
   plb.add_u64(l_mdl_evexd, "evexd", "Current expired events");
   plb.add_u64(l_mdl_segexg, "segexg", "Expiring segments");
   plb.add_u64(l_mdl_segexd, "segexd", "Current expired segments");
-  plb.add_u64_counter(l_mdl_replayed, "replayed", "Events replayed");
+  plb.add_u64_counter(l_mdl_replayed, "replayed", "Events replayed",
+                     "repl", PerfCountersBuilder::PRIO_INTERESTING);
   plb.add_time_avg(l_mdl_jlat, "jlat", "Journaler flush latency");
   plb.add_u64_counter(l_mdl_evex, "evex", "Total expired events");
   plb.add_u64_counter(l_mdl_evtrm, "evtrm", "Trimmed events");
@@ -699,7 +700,8 @@ int MDLog::trim_all()
 
   map<uint64_t,LogSegment*>::iterator p = segments.begin();
   while (p != segments.end() &&
-        p->first < last_seq && p->second->end <= safe_pos) {
+        p->first < last_seq &&
+        p->second->end < safe_pos) { // next segment should have been started
     LogSegment *ls = p->second;
     ++p;
 
@@ -862,6 +864,8 @@ void MDLog::replay(MDSInternalContextBase *c)
   if (journaler->get_read_pos() == journaler->get_write_pos()) {
     dout(10) << "replay - journal empty, done." << dendl;
     mds->mdcache->trim();
+    if (mds->is_standby_replay())
+      mds->update_mlogger();
     if (c) {
       c->complete(0);
     }
index 332ef1106d0951ce244cb32a211ed641a35268df..2f55ef825008282ce2002ff36cc0e189c68d1d25 100644 (file)
@@ -251,6 +251,11 @@ void MDSDaemon::set_up_admin_socket()
                                      asok_hook,
                                      "show cache status");
   assert(r == 0);
+  r = admin_socket->register_command("cache drop",
+                                     "cache drop name=timeout,type=CephInt,range=0,req=false",
+                                     asok_hook,
+                                     "drop cache");
+  assert(r == 0);
   r = admin_socket->register_command("dump tree",
                                     "dump tree "
                                     "name=root,type=CephString,req=true "
@@ -367,6 +372,8 @@ const char** MDSDaemon::get_tracked_conf_keys() const
     "mds_cache_reservation",
     "mds_health_cache_threshold",
     "mds_cache_mid",
+    "mds_dump_cache_threshold_formatter",
+    "mds_dump_cache_threshold_file",
     // MDBalancer
     "mds_bal_fragment_interval",
     // PurgeQueue
@@ -648,7 +655,6 @@ void MDSDaemon::handle_command(MCommand *m)
   m->put();
 }
 
-
 struct MDSCommand {
   string cmdstring;
   string helpstring;
@@ -702,6 +708,9 @@ COMMAND("heap " \
        "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
        "show heap usage info (available only if compiled with tcmalloc)", \
        "mds", "*", "cli,rest")
+COMMAND("cache drop name=timeout,type=CephInt,range=0,req=false", "trim cache and optionally "
+       "request client to release all caps and flush the journal", "mds",
+       "r", "cli,rest")
 };
 
 
@@ -856,7 +865,7 @@ int MDSDaemon::_handle_command(
     }
     else {
       bool handled = mds_rank->handle_command(cmdmap, m, &r, &ds, &ss,
-                                             need_reply);
+                                             run_later, need_reply);
       if (!handled) {
         // MDSDaemon doesn't know this command
         ss << "unrecognized command! " << prefix;
@@ -1056,8 +1065,11 @@ void MDSDaemon::suicide()
 
   //because add_observer is called after set_up_admin_socket
   //so we can use asok_hook to avoid assert in the remove_observer
-  if (asok_hook != NULL)
+  if (asok_hook != NULL) {
+    mds_lock.Unlock();
     g_conf->remove_observer(this);
+    mds_lock.Lock();
+  }
 
   clean_up_admin_socket();
 
index 86a538ce476ec2e989cf07460bcd934eed8b47bc..3b4ef75f6e40aefd3eeeba29f8b919fec2c2cc0d 100644 (file)
@@ -232,10 +232,16 @@ public:
   utime_t get_session_timeout() const {
     return utime_t(session_timeout,0);
   }
+  void set_session_timeout(uint32_t t) {
+    session_timeout = t;
+  }
 
   utime_t get_session_autoclose() const {
     return utime_t(session_autoclose, 0);
   }
+  void set_session_autoclose(uint32_t t) {
+    session_autoclose = t;
+  }
 
   uint64_t get_max_filesize() const { return max_file_size; }
   void set_max_filesize(uint64_t m) { max_file_size = m; }
@@ -635,6 +641,16 @@ public:
     }
   }
 
+  /**
+   * Get MDS rank incarnation if the rank is up, else -1
+   */
+  mds_gid_t get_incarnation(mds_rank_t m) const {
+    std::map<mds_rank_t, mds_gid_t>::const_iterator u = up.find(m);
+    if (u == up.end())
+      return MDS_GID_NONE;
+    return (mds_gid_t)get_inc_gid(u->second);
+  }
+
   int get_inc_gid(mds_gid_t gid) const {
     auto mds_info_entry = mds_info.find(gid);
     if (mds_info_entry != mds_info.end())
index 1e8b024b8aeb6950f73649ee7491a22cf4830454..b196b5044ba280f43575952b86f6346909699c3c 100644 (file)
 #undef dout_prefix
 #define dout_prefix *_dout << "mds." << whoami << '.' << incarnation << ' '
 
+class C_Flush_Journal : public MDSInternalContext {
+public:
+  C_Flush_Journal(MDCache *mdcache, MDLog *mdlog, MDSRank *mds,
+                  std::ostream *ss, Context *on_finish)
+    : MDSInternalContext(mds),
+      mdcache(mdcache), mdlog(mdlog), ss(ss), on_finish(on_finish),
+      whoami(mds->whoami), incarnation(mds->incarnation) {
+  }
+
+  void send() {
+    assert(mds->mds_lock.is_locked());
+
+    dout(20) << __func__ << dendl;
+
+    if (mdcache->is_readonly()) {
+      dout(5) << __func__ << ": read-only FS" << dendl;
+      complete(-EROFS);
+      return;
+    }
+
+    if (!mds->is_active()) {
+      dout(5) << __func__ << ": MDS not active, no-op" << dendl;
+      complete(0);
+      return;
+    }
+
+    flush_mdlog();
+  }
+
+private:
+
+  void flush_mdlog() {
+    dout(20) << __func__ << dendl;
+
+    // I need to seal off the current segment, and then mark all
+    // previous segments for expiry
+    mdlog->start_new_segment();
+
+    Context *ctx = new FunctionContext([this](int r) {
+        handle_flush_mdlog(r);
+      });
+
+    // Flush initially so that all the segments older than our new one
+    // will be eligible for expiry
+    mdlog->flush();
+    mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
+  }
+
+  void handle_flush_mdlog(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    if (r != 0) {
+      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+      complete(r);
+      return;
+    }
+
+    clear_mdlog();
+  }
+
+  void clear_mdlog() {
+    dout(20) << __func__ << dendl;
+
+    Context *ctx = new FunctionContext([this](int r) {
+        handle_clear_mdlog(r);
+      });
+
+    // Because we may not be the last wait_for_safe context on MDLog,
+    // and subsequent contexts might wake up in the middle of our
+    // later trim_all and interfere with expiry (by e.g. marking
+    // dirs/dentries dirty on previous log segments), we run a second
+    // wait_for_safe here. See #10368
+    mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, ctx));
+  }
+
+  void handle_clear_mdlog(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    if (r != 0) {
+      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
+      complete(r);
+      return;
+    }
+
+    trim_mdlog();
+  }
+
+  void trim_mdlog() {
+    // Put all the old log segments into expiring or expired state
+    dout(5) << __func__ << ": beginning segment expiry" << dendl;
+
+    int ret = mdlog->trim_all();
+    if (ret != 0) {
+      *ss << "Error " << ret << " (" << cpp_strerror(ret) << ") while trimming log";
+      complete(ret);
+      return;
+    }
+
+    expire_segments();
+  }
+
+  void expire_segments() {
+    dout(20) << __func__ << dendl;
+
+    // Attach contexts to wait for all expiring segments to expire
+    MDSGatherBuilder *expiry_gather = new MDSGatherBuilder(g_ceph_context);
+
+    const auto &expiring_segments = mdlog->get_expiring_segments();
+    for (auto p : expiring_segments) {
+      p->wait_for_expiry(expiry_gather->new_sub());
+    }
+    dout(5) << __func__ << ": waiting for " << expiry_gather->num_subs_created()
+            << " segments to expire" << dendl;
+
+    if (!expiry_gather->has_subs()) {
+      trim_segments();
+      delete expiry_gather;
+      return;
+    }
+
+    Context *ctx = new FunctionContext([this](int r) {
+        handle_expire_segments(r);
+      });
+    expiry_gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
+    expiry_gather->activate();
+  }
+
+  void handle_expire_segments(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    ceph_assert(r == 0); // MDLog is not allowed to raise errors via
+                         // wait_for_expiry
+    trim_segments();
+  }
+
+  void trim_segments() {
+    dout(20) << __func__ << dendl;
+
+    Context *ctx = new C_OnFinisher(new FunctionContext([this](int _) {
+          Mutex::Locker locker(mds->mds_lock);
+          trim_expired_segments();
+        }), mds->finisher);
+    ctx->complete(0);
+  }
+
+  void trim_expired_segments() {
+    dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now "
+            << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
+            << mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+    // Now everyone I'm interested in is expired
+    mdlog->trim_expired_segments();
+
+    dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now "
+            << std::hex << mdlog->get_journaler()->get_expire_pos() << "/"
+            << mdlog->get_journaler()->get_trimmed_pos() << dendl;
+
+    write_journal_head();
+  }
+
+  void write_journal_head() {
+    dout(20) << __func__ << dendl;
+
+    Context *ctx = new FunctionContext([this](int r) {
+        Mutex::Locker locker(mds->mds_lock);
+        handle_write_head(r);
+      });
+    // Flush the journal header so that readers will start from after
+    // the flushed region
+    mdlog->get_journaler()->write_head(ctx);
+  }
+
+  void handle_write_head(int r) {
+    if (r != 0) {
+      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
+    } else {
+      dout(5) << __func__ << ": write_head complete, all done!" << dendl;
+    }
+
+    complete(r);
+  }
+
+  void finish(int r) override {
+    dout(20) << __func__ << ": r=" << r << dendl;
+    on_finish->complete(r);
+  }
+
+  MDCache *mdcache;
+  MDLog *mdlog;
+  std::ostream *ss;
+  Context *on_finish;
+
+  // so as to use dout
+  mds_rank_t whoami;
+  int incarnation;
+};
+
+class C_Drop_Cache : public MDSInternalContext {
+public:
+  C_Drop_Cache(Server *server, MDCache *mdcache, MDLog *mdlog,
+               MDSRank *mds, uint64_t recall_timeout,
+               Formatter *f, Context *on_finish)
+    : MDSInternalContext(mds),
+      server(server), mdcache(mdcache), mdlog(mdlog),
+      recall_timeout(recall_timeout), f(f), on_finish(on_finish),
+      whoami(mds->whoami), incarnation(mds->incarnation) {
+  }
+
+  void send() {
+    // not really a hard requirement here, but let's ensure this in
+    // case we change the logic here.
+    assert(mds->mds_lock.is_locked());
+
+    dout(20) << __func__ << dendl;
+    recall_client_state();
+  }
+
+private:
+  // context which completes itself (with -ETIMEDOUT) after a specified
+  // timeout or when explicitly completed, whichever comes first. Note
+  // that the context does not destroy itself after completion -- it
+  // needs to be explicitly freed.
+  class C_ContextTimeout : public MDSInternalContext {
+  public:
+    C_ContextTimeout(MDSRank *mds, uint64_t timeout, Context *on_finish)
+      : MDSInternalContext(mds),
+        timeout(timeout),
+        lock("mds::context::timeout", false, true),
+        on_finish(on_finish) {
+    }
+    ~C_ContextTimeout() {
+      ceph_assert(timer_task == nullptr);
+    }
+
+    void start_timer() {
+      if (!timeout) {
+        return;
+      }
+
+      timer_task = new FunctionContext([this](int _) {
+          timer_task = nullptr;
+          complete(-ETIMEDOUT);
+        });
+      mds->timer.add_event_after(timeout, timer_task);
+    }
+
+    void finish(int r) override {
+      Context *ctx = nullptr;
+      {
+        Mutex::Locker locker(lock);
+        std::swap(on_finish, ctx);
+      }
+      if (ctx != nullptr) {
+        ctx->complete(r);
+      }
+    }
+    void complete(int r) override {
+      if (timer_task != nullptr) {
+        mds->timer.cancel_event(timer_task);
+      }
+
+      finish(r);
+    }
+
+    uint64_t timeout;
+    Mutex lock;
+    Context *on_finish = nullptr;
+    Context *timer_task = nullptr;
+  };
+
+  void recall_client_state() {
+    dout(20) << __func__ << dendl;
+
+    f->open_object_section("result");
+
+    MDSGatherBuilder *gather = new MDSGatherBuilder(g_ceph_context);
+    server->recall_client_state(1.0, true, gather);
+    if (!gather->has_subs()) {
+      handle_recall_client_state(0);
+      delete gather;
+      return;
+    }
+
+    C_ContextTimeout *ctx = new C_ContextTimeout(
+      mds, recall_timeout, new FunctionContext([this](int r) {
+          handle_recall_client_state(r);
+        }));
+
+    ctx->start_timer();
+    gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
+    gather->activate();
+  }
+
+  void handle_recall_client_state(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    // client recall section
+    f->open_object_section("client_recall");
+    f->dump_int("return_code", r);
+    f->dump_string("message", cpp_strerror(r));
+    f->close_section();
+
+    // we can still continue after recall timeout
+    trim_cache();
+  }
+
+  void trim_cache() {
+    dout(20) << __func__ << dendl;
+
+    if (!mdcache->trim(UINT64_MAX)) {
+      cmd_err(f, "failed to trim cache");
+      complete(-EINVAL);
+      return;
+    }
+
+    flush_journal();
+  }
+
+  void flush_journal() {
+    dout(20) << __func__ << dendl;
+
+    Context *ctx = new FunctionContext([this](int r) {
+        handle_flush_journal(r);
+      });
+
+    C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, mds, &ss, ctx);
+    flush_journal->send();
+  }
+
+  void handle_flush_journal(int r) {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    if (r != 0) {
+      cmd_err(f, ss.str());
+      complete(r);
+      return;
+    }
+
+    // journal flush section
+    f->open_object_section("flush_journal");
+    f->dump_int("return_code", r);
+    f->dump_string("message", ss.str());
+    f->close_section();
+
+    cache_status();
+  }
+
+  void cache_status() {
+    dout(20) << __func__ << dendl;
+
+    // cache status section
+    mdcache->cache_status(f);
+    f->close_section();
+
+    complete(0);
+  }
+
+  void finish(int r) override {
+    dout(20) << __func__ << ": r=" << r << dendl;
+
+    on_finish->complete(r);
+  }
+
+  Server *server;
+  MDCache *mdcache;
+  MDLog *mdlog;
+  uint64_t recall_timeout;
+  Formatter *f;
+  Context *on_finish;
+
+  int retval = 0;
+  std::stringstream ss;
+
+  // so as to use dout
+  mds_rank_t whoami;
+  int incarnation;
+
+  void cmd_err(Formatter *f, boost::string_view err) {
+    f->reset();
+    f->open_object_section("result");
+    f->dump_string("error", err);
+    f->close_section();
+  }
+};
+
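Both helper classes above (C_Flush_Journal and C_Drop_Cache) drive a multi-step operation by having each stage finish through a callback that kicks off the next stage, so no stage blocks while waiting on journal or client I/O. A tiny, self-contained sketch of that continuation-chaining style (standard C++ lambdas only; no Ceph Context machinery):

    #include <functional>
    #include <iostream>

    // Illustrative only: a three-stage pipeline where every stage completes
    // through a callback, and the callback decides whether to continue.
    struct Pipeline {
      std::function<void(int)> on_finish;

      void start() { stage_one(); }

      void stage_one() {
        std::cout << "stage one\n";
        complete_async(0, [this](int r) { r ? on_finish(r) : stage_two(); });
      }
      void stage_two() {
        std::cout << "stage two\n";
        complete_async(0, [this](int r) { on_finish(r); });
      }
      // stand-in for an asynchronous completion; a real system would defer this
      void complete_async(int r, std::function<void(int)> next) { next(r); }
    };

    int main() {
      Pipeline p;
      p.on_finish = [](int r) { std::cout << "all done, r=" << r << "\n"; };
      p.start();
    }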
 MDSRank::MDSRank(
     mds_rank_t whoami_,
     Mutex &mds_lock_,
@@ -77,9 +462,10 @@ MDSRank::MDSRank(
           // Purge Queue operates inside mds_lock when we're calling into
           // it, and outside when in background, so must handle both cases.
           if (mds_lock.is_locked_by_me()) {
-            damaged();
+            handle_write_error(r);
           } else {
-            damaged_unlocked();
+            Mutex::Locker l(mds_lock);
+            handle_write_error(r);
           }
         }
       )
@@ -525,9 +911,14 @@ void MDSRank::ProgressThread::shutdown()
 
 bool MDSRankDispatcher::ms_dispatch(Message *m)
 {
-  bool ret;
+  if (m->get_source().is_client()) {
+    Session *session = static_cast<Session*>(m->get_connection()->get_priv());
+    if (session)
+      session->last_seen = Session::clock::now();
+  }
+
   inc_dispatch_depth();
-  ret = _dispatch(m, true);
+  bool ret = _dispatch(m, true);
   dec_dispatch_depth();
   return ret;
 }
@@ -819,7 +1210,8 @@ void MDSRank::heartbeat_reset()
   // NB not enabling suicide grace, because the mon takes care of killing us
   // (by blacklisting us) when we fail to send beacons, and it's simpler to
   // only have one way of dying.
-  g_ceph_context->get_heartbeat_map()->reset_timeout(hb, g_conf->mds_beacon_grace, 0);
+  auto grace = g_conf->get_val<double>("mds_heartbeat_grace");
+  g_ceph_context->get_heartbeat_map()->reset_timeout(hb, grace, 0);
 }
 
 bool MDSRank::is_stale_message(Message *m) const
@@ -872,6 +1264,7 @@ Session *MDSRank::get_session(Message *m)
         imported_session->auth_caps = session->auth_caps;
         assert(session->get_nref() == 1);
         imported_session->connection->set_priv(imported_session->get());
+        imported_session->last_seen = session->last_seen;
         session = imported_session;
       }
     }
@@ -1062,6 +1455,8 @@ void MDSRank::boot_start(BootStep step, int r)
         << cpp_strerror(r);
       damaged();
       assert(r == 0);  // Unreachable, damaged() calls respawn()
+    } else if (r == -EROFS) {
+      dout(0) << "boot error forcing transition to read-only; MDS will try to continue" << dendl;
     } else {
       // Completely unexpected error, give up and die
       dout(0) << "boot_start encountered an error, failing" << dendl;
@@ -1271,7 +1666,7 @@ void MDSRank::standby_replay_restart()
 {
   if (standby_replaying) {
     /* Go around for another pass of replaying in standby */
-    dout(4) << "standby_replay_restart (as standby)" << dendl;
+    dout(5) << "Restarting replay as standby-replay" << dendl;
     mdlog->get_journaler()->reread_head_and_probe(
       new C_MDS_StandbyReplayRestartFinish(
         this,
@@ -1300,7 +1695,11 @@ void MDSRank::standby_replay_restart()
 
 void MDSRank::replay_done()
 {
-  dout(1) << "replay_done" << (standby_replaying ? " (as standby)" : "") << dendl;
+  if (!standby_replaying) {
+    dout(1) << "Finished replaying journal" << dendl;
+  } else {
+    dout(5) << "Finished replaying journal as standby-replay" << dendl;
+  }
 
   if (is_standby_replay()) {
     // The replay was done in standby state, and we are still in that state
@@ -1322,7 +1721,7 @@ void MDSRank::replay_done()
 
     // Reformat and come back here
     if (mdlog->get_journaler()->get_stream_format() < g_conf->mds_journal_format) {
-        dout(4) << "reformatting journal on standbyreplay->replay transition" << dendl;
+        dout(4) << "reformatting journal on standby-replay->replay transition" << dendl;
         mdlog->reopen(new C_MDS_BootStart(this, MDS_BOOT_REPLAY_DONE));
         return;
     }
@@ -2012,9 +2411,18 @@ bool MDSRankDispatcher::handle_asok_command(
     }
   } else if (command == "cache status") {
     Mutex::Locker l(mds_lock);
-    int r = mdcache->cache_status(f);
+    mdcache->cache_status(f);
+  } else if (command == "cache drop") {
+    int64_t timeout;
+    if (!cmd_getval(g_ceph_context, cmdmap, "timeout", timeout)) {
+      timeout = 0;
+    }
+
+    C_SaferCond cond;
+    command_cache_drop((uint64_t)timeout, f, &cond);
+    int r = cond.wait();
     if (r != 0) {
-      ss << "Failed to get cache status: " << cpp_strerror(r);
+      f->flush(ss);
     }
   } else if (command == "dump tree") {
     string root;
@@ -2053,19 +2461,26 @@ bool MDSRankDispatcher::handle_asok_command(
   return true;
 }
 
-class C_MDS_Send_Command_Reply : public MDSInternalContext
-{
+class C_MDS_Send_Command_Reply : public MDSInternalContext {
 protected:
   MCommand *m;
 public:
   C_MDS_Send_Command_Reply(MDSRank *_mds, MCommand *_m) :
     MDSInternalContext(_mds), m(_m) { m->get(); }
-  void send (int r, boost::string_view out_str) {
+
+  void send(int r, boost::string_view ss) {
+    std::stringstream ds;
+    send(r, ss, ds);
+  }
+
+  void send(int r, boost::string_view ss, std::stringstream &ds) {
     bufferlist bl;
-    MDSDaemon::send_command_reply(m, mds, r, bl, out_str);
+    bl.append(ds);
+    MDSDaemon::send_command_reply(m, mds, r, bl, ss);
     m->put();
   }
-  void finish (int r) override {
+
+  void finish(int r) override {
     send(r, "");
   }
 };
@@ -2206,142 +2621,27 @@ void MDSRank::command_flush_path(Formatter *f, boost::string_view path)
   f->close_section(); // results
 }
 
-/**
- * Wrapper around _command_flush_journal that
- * handles serialization of result
- */
-void MDSRank::command_flush_journal(Formatter *f)
-{
-  assert(f != NULL);
+// synchronous wrapper around "journal flush" asynchronous context
+// execution.
+void MDSRank::command_flush_journal(Formatter *f) {
+  ceph_assert(f != NULL);
 
+  C_SaferCond cond;
   std::stringstream ss;
-  const int r = _command_flush_journal(&ss);
-  f->open_object_section("result");
-  f->dump_string("message", ss.str());
-  f->dump_int("return_code", r);
-  f->close_section();
-}
-
-/**
- * Implementation of "flush journal" asok command.
- *
- * @param ss
- * Optionally populate with a human readable string describing the
- * reason for any unexpected return status.
- */
-int MDSRank::_command_flush_journal(std::stringstream *ss)
-{
-  assert(ss != NULL);
 
-  Mutex::Locker l(mds_lock);
-
-  if (mdcache->is_readonly()) {
-    dout(5) << __func__ << ": read-only FS" << dendl;
-    return -EROFS;
-  }
-
-  if (!is_active()) {
-    dout(5) << __func__ << ": MDS not active, no-op" << dendl;
-    return 0;
-  }
-
-  // I need to seal off the current segment, and then mark all previous segments
-  // for expiry
-  mdlog->start_new_segment();
-  int r = 0;
-
-  // Flush initially so that all the segments older than our new one
-  // will be elegible for expiry
-  {
-    C_SaferCond mdlog_flushed;
-    mdlog->flush();
-    mdlog->wait_for_safe(new MDSInternalContextWrapper(this, &mdlog_flushed));
-    mds_lock.Unlock();
-    r = mdlog_flushed.wait();
-    mds_lock.Lock();
-    if (r != 0) {
-      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
-      return r;
-    }
-  }
-
-  // Because we may not be the last wait_for_safe context on MDLog, and
-  // subsequent contexts might wake up in the middle of our later trim_all
-  // and interfere with expiry (by e.g. marking dirs/dentries dirty
-  // on previous log segments), we run a second wait_for_safe here.
-  // See #10368
   {
-    C_SaferCond mdlog_cleared;
-    mdlog->wait_for_safe(new MDSInternalContextWrapper(this, &mdlog_cleared));
-    mds_lock.Unlock();
-    r = mdlog_cleared.wait();
-    mds_lock.Lock();
-    if (r != 0) {
-      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while flushing journal";
-      return r;
-    }
+    Mutex::Locker locker(mds_lock);
+    C_Flush_Journal *flush_journal = new C_Flush_Journal(mdcache, mdlog, this, &ss, &cond);
+    flush_journal->send();
   }
+  int r = cond.wait();
 
-  // Put all the old log segments into expiring or expired state
-  dout(5) << __func__ << ": beginning segment expiry" << dendl;
-  r = mdlog->trim_all();
-  if (r != 0) {
-    *ss << "Error " << r << " (" << cpp_strerror(r) << ") while trimming log";
-    return r;
-  }
-
-  // Attach contexts to wait for all expiring segments to expire
-  MDSGatherBuilder expiry_gather(g_ceph_context);
-
-  const std::set<LogSegment*> &expiring_segments = mdlog->get_expiring_segments();
-  for (std::set<LogSegment*>::const_iterator i = expiring_segments.begin();
-       i != expiring_segments.end(); ++i) {
-    (*i)->wait_for_expiry(expiry_gather.new_sub());
-  }
-  dout(5) << __func__ << ": waiting for " << expiry_gather.num_subs_created()
-          << " segments to expire" << dendl;
-
-  if (expiry_gather.has_subs()) {
-    C_SaferCond cond;
-    expiry_gather.set_finisher(new MDSInternalContextWrapper(this, &cond));
-    expiry_gather.activate();
-
-    // Drop mds_lock to allow progress until expiry is complete
-    mds_lock.Unlock();
-    int r = cond.wait();
-    mds_lock.Lock();
-
-    assert(r == 0);  // MDLog is not allowed to raise errors via wait_for_expiry
-  }
-
-  dout(5) << __func__ << ": expiry complete, expire_pos/trim_pos is now " << std::hex <<
-    mdlog->get_journaler()->get_expire_pos() << "/" <<
-    mdlog->get_journaler()->get_trimmed_pos() << dendl;
-
-  // Now everyone I'm interested in is expired
-  mdlog->trim_expired_segments();
-
-  dout(5) << __func__ << ": trim complete, expire_pos/trim_pos is now " << std::hex <<
-    mdlog->get_journaler()->get_expire_pos() << "/" <<
-    mdlog->get_journaler()->get_trimmed_pos() << dendl;
-
-  // Flush the journal header so that readers will start from after the flushed region
-  C_SaferCond wrote_head;
-  mdlog->get_journaler()->write_head(&wrote_head);
-  mds_lock.Unlock();  // Drop lock to allow messenger dispatch progress
-  r = wrote_head.wait();
-  mds_lock.Lock();
-  if (r != 0) {
-      *ss << "Error " << r << " (" << cpp_strerror(r) << ") while writing header";
-      return r;
-  }
-
-  dout(5) << __func__ << ": write_head complete, all done!" << dendl;
-
-  return 0;
+  f->open_object_section("result");
+  f->dump_string("message", ss.str());
+  f->dump_int("return_code", r);
+  f->close_section();
 }
 
-
 void MDSRank::command_get_subtrees(Formatter *f)
 {
   assert(f != NULL);
@@ -2895,6 +3195,7 @@ bool MDSRankDispatcher::handle_command(
   int *r,
   std::stringstream *ds,
   std::stringstream *ss,
+  Context **run_later,
   bool *need_reply)
 {
   assert(r != nullptr);
@@ -2916,10 +3217,9 @@ bool MDSRankDispatcher::handle_command(
       return true;
     }
 
-    Formatter *f = new JSONFormatter(true);
-    dump_sessions(filter, f);
-    f->flush(*ds);
-    delete f;
+    JSONFormatter f(true);
+    dump_sessions(filter, &f);
+    f.flush(*ds);
     return true;
   } else if (prefix == "session evict" || prefix == "client evict") {
     std::vector<std::string> filter_args;
@@ -2936,10 +3236,9 @@ bool MDSRankDispatcher::handle_command(
     *need_reply = false;
     return true;
   } else if (prefix == "damage ls") {
-    Formatter *f = new JSONFormatter(true);
-    damage_table.dump(f);
-    f->flush(*ds);
-    delete f;
+    JSONFormatter f(true);
+    damage_table.dump(&f);
+    f.flush(*ds);
     return true;
   } else if (prefix == "damage rm") {
     damage_entry_id_t id = 0;
@@ -2950,12 +3249,56 @@ bool MDSRankDispatcher::handle_command(
     }
 
     damage_table.erase(id);
+    return true;
+  } else if (prefix == "cache drop") {
+    int64_t timeout;
+    if (!cmd_getval(g_ceph_context, cmdmap, "timeout", timeout)) {
+      timeout = 0;
+    }
+
+    JSONFormatter *f = new JSONFormatter(true);
+    C_MDS_Send_Command_Reply *reply = new C_MDS_Send_Command_Reply(this, m);
+    Context *on_finish = new FunctionContext([this, f, reply](int r) {
+        cache_drop_send_reply(f, reply, r);
+        delete f;
+        delete reply;
+      });
+
+    *need_reply = false;
+    *run_later = new C_OnFinisher(
+      new FunctionContext([this, timeout, f, on_finish](int _) {
+          command_cache_drop((uint64_t)timeout, f, on_finish);
+        }), finisher);
+
     return true;
   } else {
     return false;
   }
 }
 
+void MDSRank::cache_drop_send_reply(Formatter *f, C_MDS_Send_Command_Reply *reply, int r) {
+  dout(20) << __func__ << ": r=" << r << dendl;
+
+  std::stringstream ds;
+  std::stringstream ss;
+  if (r != 0) {
+    f->flush(ss);
+  } else {
+    f->flush(ds);
+  }
+
+  reply->send(r, ss.str(), ds);
+}
+
+void MDSRank::command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish) {
+  dout(20) << __func__ << dendl;
+
+  Mutex::Locker locker(mds_lock);
+  C_Drop_Cache *request = new C_Drop_Cache(server, mdcache, mdlog, this,
+                                           timeout, f, on_finish);
+  request->send();
+}
+
 epoch_t MDSRank::get_osd_epoch() const
 {
   return objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));  
index faac81dba19654ec70d8ccadad8e7c8e5f0e83c1..46f72a4b964bb1644f0daa030954661ddbc390a9 100644 (file)
@@ -114,6 +114,7 @@ class MonClient;
 class Finisher;
 class MMDSMap;
 class ScrubStack;
+class C_MDS_Send_Command_Reply;
 
 /**
  * The public part of this class's interface is what's exposed to all
@@ -129,6 +130,10 @@ class MDSRank {
     int incarnation;
 
   public:
+
+    friend class C_Flush_Journal;
+    friend class C_Drop_Cache;
+
     mds_rank_t get_nodeid() const { return whoami; }
     int64_t get_metadata_pool();
 
@@ -465,11 +470,13 @@ class MDSRank {
         std::ostream &ss,
         Formatter *f);
     int _command_export_dir(boost::string_view path, mds_rank_t dest);
-    int _command_flush_journal(std::stringstream *ss);
     CDir *_command_dirfrag_get(
         const cmdmap_t &cmdmap,
         std::ostream &ss);
 
+    void cache_drop_send_reply(Formatter *f, C_MDS_Send_Command_Reply *reply, int r);
+    void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish);
+
   protected:
     Messenger    *messenger;
     MonClient    *monc;
@@ -584,6 +591,7 @@ public:
     int *r,
     std::stringstream *ds,
     std::stringstream *ss,
+    Context **run_later,
     bool *need_reply);
 
   void dump_sessions(const SessionFilter &filter, Formatter *f) const;
index fa5d43f6229f584c25e935b067c05aac15dd57a8..329c60e743e31a2e3cb79fa423e5513a98597d7e 100644 (file)
@@ -19,6 +19,7 @@
 
 #include "PurgeQueue.h"
 
+#include <string.h>
 
 #define dout_context cct
 #define dout_subsys ceph_subsys_mds
@@ -93,6 +94,7 @@ PurgeQueue::~PurgeQueue()
   if (logger) {
     g_ceph_context->get_perfcounters_collection()->remove(logger.get());
   }
+  delete on_error;
 }
 
 void PurgeQueue::create_logger()
@@ -123,6 +125,12 @@ void PurgeQueue::init()
 void PurgeQueue::activate()
 {
   Mutex::Locker l(lock);
+
+  if (readonly) {
+    dout(10) << "skipping activate: PurgeQueue is readonly" << dendl;
+    return;
+  }
+
   if (journaler.get_read_pos() == journaler.get_write_pos())
     return;
 
@@ -177,7 +185,7 @@ void PurgeQueue::open(Context *completion)
       finish_contexts(g_ceph_context, waiting_for_recovery);
     } else {
       derr << "Error " << r << " loading Journaler" << dendl;
-      on_error->complete(r);
+      _go_readonly(r);
     }
   }));
 }
@@ -185,10 +193,14 @@ void PurgeQueue::open(Context *completion)
 void PurgeQueue::wait_for_recovery(Context* c)
 {
   Mutex::Locker l(lock);
-  if (recovered)
+  if (recovered) {
     c->complete(0);
-  else
+  } else if (readonly) {
+    dout(10) << "cannot wait for recovery: PurgeQueue is readonly" << dendl;
+    c->complete(-EROFS);
+  } else {
     waiting_for_recovery.push_back(c);
+  }
 }
 
 void PurgeQueue::_recover()
@@ -210,7 +222,7 @@ void PurgeQueue::_recover()
     if (journaler.get_error()) {
       int r = journaler.get_error();
       derr << "Error " << r << " recovering write_pos" << dendl;
-      on_error->complete(r);
+      _go_readonly(r);
       return;
     }
 
@@ -244,8 +256,12 @@ void PurgeQueue::create(Context *fin)
   journaler.create(&layout, JOURNAL_FORMAT_RESILIENT);
   journaler.write_head(new FunctionContext([this](int r) {
     Mutex::Locker l(lock);
-    recovered = true;
-    finish_contexts(g_ceph_context, waiting_for_recovery);
+    if (r) {
+      _go_readonly(r);
+    } else {
+      recovered = true;
+      finish_contexts(g_ceph_context, waiting_for_recovery);
+    }
   }));
 }
 
@@ -257,6 +273,12 @@ void PurgeQueue::push(const PurgeItem &pi, Context *completion)
   dout(4) << "pushing inode 0x" << std::hex << pi.ino << std::dec << dendl;
   Mutex::Locker l(lock);
 
+  if (readonly) {
+    dout(10) << "cannot push inode: PurgeQueue is readonly" << dendl;
+    completion->complete(-EROFS);
+    return;
+  }
+
   // Callers should have waited for open() before using us
   assert(!journaler.is_readonly());
 
@@ -271,7 +293,7 @@ void PurgeQueue::push(const PurgeItem &pi, Context *completion)
   if (!could_consume) {
     // Usually, it is not necessary to explicitly flush here, because the reader
     // will get flushes generated inside Journaler::is_readable.  However,
-    // if we remain in a can_consume()==false state for a long period then
+    // if we remain in a _can_consume()==false state for a long period then
     // we should flush in order to allow MDCache to drop its strays rather
     // than having them wait for purgequeue to progress.
     if (!delayed_flush) {
@@ -317,8 +339,13 @@ uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const
   return ops_required;
 }
 
-bool PurgeQueue::can_consume()
+bool PurgeQueue::_can_consume()
 {
+  if (readonly) {
+    dout(10) << "can't consume: PurgeQueue is readonly" << dendl;
+    return false;
+  }
+
   dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, "
            << in_flight.size() << "/" << g_conf->mds_max_purge_files
            << " files" << dendl;
@@ -346,12 +373,23 @@ bool PurgeQueue::can_consume()
   }
 }
 
+void PurgeQueue::_go_readonly(int r)
+{
+  if (readonly) return;
+  dout(1) << "going readonly because internal IO failed: " << strerror(-r) << dendl;
+  readonly = true;
+  on_error->complete(r);
+  on_error = nullptr;
+  journaler.set_readonly();
+  finish_contexts(g_ceph_context, waiting_for_recovery, r);
+}
+
 bool PurgeQueue::_consume()
 {
   assert(lock.is_locked_by_me());
 
   bool could_consume = false;
-  while(can_consume()) {
+  while(_can_consume()) {
 
     if (delayed_flush) {
       // We are now going to read from the journal, so any proactive
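_go_readonly() above latches the queue into a read-only state on the first internal I/O failure: it fires on_error exactly once, clears the pointer, and later errors become no-ops. A small sketch of that one-shot error latch (plain C++, invented names):

    #include <functional>
    #include <iostream>

    // Illustrative only: the first failure flips the object read-only and
    // consumes the error callback; later failures are ignored.
    struct Queue {
      bool readonly = false;
      std::function<void(int)> on_error;

      void fail(int r) {
        if (readonly) return;            // already latched
        readonly = true;
        if (on_error) {
          auto cb = std::move(on_error);
          on_error = nullptr;
          cb(r);                         // report the first error exactly once
        }
      }
    };

    int main() {
      Queue q;
      q.on_error = [](int r) { std::cout << "went readonly, r=" << r << "\n"; };
      q.fail(-5);
      q.fail(-9);                        // ignored
    }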
@@ -363,7 +401,7 @@ bool PurgeQueue::_consume()
 
     if (int r = journaler.get_error()) {
       derr << "Error " << r << " recovering write_pos" << dendl;
-      on_error->complete(r);
+      _go_readonly(r);
       return could_consume;
     }
 
@@ -377,7 +415,7 @@ bool PurgeQueue::_consume()
           if (r == 0) {
             _consume();
           } else if (r != -EAGAIN) {
-            on_error->complete(r);
+            _go_readonly(r);
           }
         }));
       }
@@ -399,7 +437,7 @@ bool PurgeQueue::_consume()
     } catch (const buffer::error &err) {
       derr << "Decode error at read_pos=0x" << std::hex
            << journaler.get_read_pos() << dendl;
-      on_error->complete(0);
+      _go_readonly(EIO);
     }
     dout(20) << " executing item (0x" << std::hex << item.ino
              << std::dec << ")" << dendl;
@@ -419,7 +457,8 @@ void PurgeQueue::_execute_item(
 
   in_flight[expire_to] = item;
   logger->set(l_pq_executing, in_flight.size());
-  ops_in_flight += _calculate_ops(item);
+  auto ops = _calculate_ops(item);
+  ops_in_flight += ops;
   logger->set(l_pq_executing_ops, ops_in_flight);
 
   SnapContext nullsnapc;
@@ -486,6 +525,8 @@ void PurgeQueue::_execute_item(
   } else {
     derr << "Invalid item (action=" << item.action << ") in purge queue, "
             "dropping it" << dendl;
+    ops_in_flight -= ops;
+    logger->set(l_pq_executing_ops, ops_in_flight);
     in_flight.erase(expire_to);
     logger->set(l_pq_executing, in_flight.size());
     return;
@@ -505,9 +546,7 @@ void PurgeQueue::_execute_item(
     // expire_pos doesn't fall too far behind our progress when consuming
     // a very long queue.
     if (in_flight.empty() || journaler.write_head_needed()) {
-      journaler.write_head(new FunctionContext([this](int r){
-            journaler.trim();
-            }));
+      journaler.write_head(nullptr);
     }
   }), &finisher));
 
@@ -551,6 +590,11 @@ void PurgeQueue::update_op_limit(const MDSMap &mds_map)
 {
   Mutex::Locker l(lock);
 
+  if (readonly) {
+    dout(10) << "skipping; PurgeQueue is readonly" << dendl;
+    return;
+  }
+
   uint64_t pg_count = 0;
   objecter->with_osdmap([&](const OSDMap& o) {
     // Number of PGs across all data pools
@@ -607,6 +651,13 @@ bool PurgeQueue::drain(
     size_t *in_flight_count
     )
 {
+  Mutex::Locker l(lock);
+
+  if (readonly) {
+    dout(10) << "skipping drain; PurgeQueue is readonly" << dendl;
+    return true;
+  }
+
   assert(progress != nullptr);
   assert(progress_total != nullptr);
   assert(in_flight_count != nullptr);
index 6a13a57ee30360f14d913d0abc8166718d95f99d..eac9a1a46f62780243b4145030a8603c3e3708fd 100644 (file)
@@ -77,6 +77,7 @@ protected:
   CephContext *cct;
   const mds_rank_t rank;
   Mutex lock;
+  bool readonly = false;
 
   int64_t metadata_pool;
 
@@ -103,7 +104,7 @@ protected:
 
   uint32_t _calculate_ops(const PurgeItem &item) const;
 
-  bool can_consume();
+  bool _can_consume();
 
   // How many bytes were remaining when drain() was first called,
   // used for indicating progress.
@@ -133,6 +134,8 @@ protected:
   bool recovered;
   std::list<Context*> waiting_for_recovery;
 
+  void _go_readonly(int r);
+
 public:
   void init();
   void activate();
index 04a9e3e386b4364cd19b8c3e16a8cc5c3380f680..e3fe19aa321d9c95491b5cdb83bc2079251a78d4 100644 (file)
@@ -482,17 +482,24 @@ void Server::handle_client_session(MClientSession *m)
   m->put();
 }
 
+
+void Server::flush_session(Session *session, MDSGatherBuilder *gather) {
+  if (!session->is_open() ||
+      !session->connection.get() ||
+      !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
+    return;
+  }
+
+  version_t seq = session->wait_for_flush(gather->new_sub());
+  mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
+}
+
 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
 {
   for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
     Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
     assert(session);
-    if (!session->is_open() ||
-       !session->connection.get() ||
-       !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
-      continue;
-    version_t seq = session->wait_for_flush(gather.new_sub());
-    mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
+    flush_session(session, &gather);
   }
 }
 
@@ -724,21 +731,40 @@ void Server::find_idle_sessions()
   //  (caps go stale, lease die)
   double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
   double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
-  while (1) {
-    Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
-    if (!session) break;
-    auto last_cap_renew_span = std::chrono::duration<double>(now-session->last_cap_renew).count();
-    if (last_cap_renew_span < cutoff) {
-      dout(20) << "laggiest active session is " << session->info.inst << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
-      break;
+
+  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
+  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
+    std::vector<Session*> new_stale;
+
+    for (auto session : *(sessions_p1->second)) {
+      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
+      if (last_cap_renew_span < cutoff) {
+       dout(20) << "laggiest active session is " << session->info.inst
+                << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+       break;
+      }
+
+      if (session->last_seen > session->last_cap_renew) {
+       last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
+       if (last_cap_renew_span < cutoff) {
+         dout(20) << "laggiest active session is " << session->info.inst
+                  << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
+         continue;
+       }
+      }
+
+      dout(10) << "new stale session " << session->info.inst
+              << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
+      new_stale.push_back(session);
     }
 
-    dout(10) << "new stale session " << session->info.inst << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
-    mds->sessionmap.set_state(session, Session::STATE_STALE);
-    mds->locker->revoke_stale_caps(session);
-    mds->locker->remove_stale_leases(session);
-    mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
-    finish_flush_session(session, session->get_push_seq());
+    for (auto session : new_stale) {
+      mds->sessionmap.set_state(session, Session::STATE_STALE);
+      mds->locker->revoke_stale_caps(session);
+      mds->locker->remove_stale_leases(session);
+      mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
+      finish_flush_session(session, session->get_push_seq());
+    }
   }
 
   // autoclose
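The stale-session scan above now also consults session->last_seen, so a client that is still sending requests but slow to renew its caps is skipped rather than immediately marked stale. A compact sketch of that two-timestamp test (self-contained, names invented):

    #include <chrono>
    #include <iostream>

    using Clock = std::chrono::steady_clock;

    // Illustrative only: a session counts as stale when neither its last cap
    // renewal nor the last message we saw from it is within the cutoff.
    bool is_stale(Clock::time_point now, Clock::time_point last_cap_renew,
                  Clock::time_point last_seen, double cutoff_s) {
      auto secs = [](auto d) { return std::chrono::duration<double>(d).count(); };
      if (secs(now - last_cap_renew) < cutoff_s)
        return false;                    // renewed caps recently
      if (secs(now - last_seen) < cutoff_s)
        return false;                    // still actively talking to us
      return true;
    }

    int main() {
      auto now = Clock::now();
      auto old_time = now - std::chrono::seconds(120);
      std::cout << is_stale(now, old_time, now, 60.0) << "\n";       // 0: recently seen
      std::cout << is_stale(now, old_time, old_time, 60.0) << "\n";  // 1: stale
    }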
@@ -758,11 +784,11 @@ void Server::find_idle_sessions()
 
   // Collect a list of sessions exceeding the autoclose threshold
   std::vector<Session *> to_evict;
-  const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
-  if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
+  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
+  if (sessions_p2 == mds->sessionmap.by_state.end() || sessions_p2->second->empty()) {
     return;
   }
-  const auto &stale_sessions = sessions_p->second;
+  const auto &stale_sessions = sessions_p2->second;
   assert(stale_sessions != nullptr);
 
   for (const auto &session: *stale_sessions) {
@@ -934,7 +960,7 @@ void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
 
   // clients will get the mdsmap and discover we're reconnecting via the monitor.
   
-  reconnect_start = ceph_clock_now();
+  reconnect_start = clock::now();
   dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
   mds->sessionmap.dump();
 }
@@ -953,8 +979,7 @@ void Server::handle_client_reconnect(MClientReconnect *m)
     return;
   }
 
-  utime_t delay = ceph_clock_now();
-  delay -= reconnect_start;
+  auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
   dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
 
   bool deny = false;
@@ -1049,6 +1074,8 @@ void Server::handle_client_reconnect(MClientReconnect *m)
   }
   mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
 
+  reconnect_last_seen = clock::now();
+
   // remove from gather set
   client_reconnect_gather.erase(from);
   if (client_reconnect_gather.empty())
@@ -1070,52 +1097,70 @@ void Server::reconnect_gather_finish()
 void Server::reconnect_tick()
 {
   if (reconnect_evicting) {
-    dout(4) << "reconnect_tick: waiting for evictions" << dendl;
+    dout(7) << "reconnect_tick: waiting for evictions" << dendl;
     return;
   }
 
-  utime_t reconnect_end = reconnect_start;
-  reconnect_end += g_conf->mds_reconnect_timeout;
-  if (ceph_clock_now() >= reconnect_end &&
-      !client_reconnect_gather.empty()) {
-    dout(10) << "reconnect timed out" << dendl;
+  if (client_reconnect_gather.empty())
+    return;
 
-    // If we're doing blacklist evictions, use this to wait for them before
-    // proceeding to reconnect_gather_finish
-    MDSGatherBuilder gather(g_ceph_context);
+  auto now = clock::now();
+  auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
+  if (elapse1 < g_conf->mds_reconnect_timeout)
+    return;
 
-    for (set<client_t>::iterator p = client_reconnect_gather.begin();
-        p != client_reconnect_gather.end();
-        ++p) {
-      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
-      assert(session);
-      dout(1) << "reconnect gave up on " << session->info.inst << dendl;
-
-      mds->clog->warn() << "evicting unresponsive client " << *session
-                        << ", after waiting " << g_conf->mds_reconnect_timeout
-                        << " seconds during MDS startup";
-
-      if (g_conf->mds_session_blacklist_on_timeout) {
-        std::stringstream ss;
-        mds->evict_client(session->info.inst.name.num(), false, true, ss,
-                          gather.new_sub());
-      } else {
-        kill_session(session, NULL);
-      }
+  vector<Session*> remaining_sessions;
+  remaining_sessions.reserve(client_reconnect_gather.size());
+  for (auto c : client_reconnect_gather) {
+    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
+    ceph_assert(session);
+    remaining_sessions.push_back(session);
+    // client re-sends cap flush messages before the reconnect message
+    if (session->last_seen > reconnect_last_seen)
+      reconnect_last_seen = session->last_seen;
+  }
 
-      failed_reconnects++;
-    }
-    client_reconnect_gather.clear();
+  auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
+  if (elapse2 < g_conf->mds_reconnect_timeout / 2) {
+    dout(7) << "reconnect_tick: last seen " << elapse2
+            << " seconds ago, extending reconnect interval" << dendl;
+    return;
+  }
+
+  dout(7) << "reconnect timed out, " << remaining_sessions.size()
+         << " clients have not reconnected in time" << dendl;
+
+  // If we're doing blacklist evictions, use this to wait for them before
+  // proceeding to reconnect_gather_finish
+  MDSGatherBuilder gather(g_ceph_context);
+
+  for (auto session : remaining_sessions) {
+    dout(1) << "reconnect gives up on " << session->info.inst << dendl;
 
-    if (gather.has_subs()) {
-      dout(1) << "reconnect will complete once clients are evicted" << dendl;
-      gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
-            [this](int r){reconnect_gather_finish();})));
-      gather.activate();
-      reconnect_evicting = true;
+    mds->clog->warn() << "evicting unresponsive client " << *session
+                     << ", after waiting " << elapse1
+                     << " seconds during MDS startup";
+
+    if (g_conf->mds_session_blacklist_on_timeout) {
+      std::stringstream ss;
+      mds->evict_client(session->get_client().v, false, true, ss,
+                       gather.new_sub());
     } else {
-      reconnect_gather_finish();
+      kill_session(session, NULL);
     }
+
+    failed_reconnects++;
+  }
+  client_reconnect_gather.clear();
+
+  if (gather.has_subs()) {
+    dout(1) << "reconnect will complete once clients are evicted" << dendl;
+    gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
+           [this](int r){reconnect_gather_finish();})));
+    gather.activate();
+    reconnect_evicting = true;
+  } else {
+    reconnect_gather_finish();
   }
 }
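reconnect_tick() above now only gives up on stragglers once the full reconnect timeout has passed since reconnect started and no reconnecting client has been heard from within half of that timeout; otherwise the window is extended. A minimal sketch of that decision (illustrative, standard C++):

    #include <chrono>
    #include <iostream>

    using Clock = std::chrono::steady_clock;

    // Illustrative only: give up waiting for reconnects when both the overall
    // window has elapsed and no client was heard from in the last timeout/2.
    bool give_up(Clock::time_point reconnect_start, Clock::time_point last_seen,
                 Clock::time_point now, double timeout_s) {
      auto secs = [](auto d) { return std::chrono::duration<double>(d).count(); };
      if (secs(now - reconnect_start) < timeout_s)
        return false;                    // still inside the reconnect window
      if (secs(now - last_seen) < timeout_s / 2)
        return false;                    // recent activity, extend the window
      return true;
    }

    int main() {
      auto t0 = Clock::now();
      auto now = t0 + std::chrono::seconds(50);
      std::cout << give_up(t0, t0, now, 45.0) << "\n";                            // 1: evict
      std::cout << give_up(t0, now - std::chrono::seconds(5), now, 45.0) << "\n"; // 0: extend
    }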
 
@@ -1147,8 +1192,12 @@ void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
  * to trim some caps, and consequently unpin some inodes in the MDCache so
  * that it can trim too.
  */
-void Server::recall_client_state(void)
-{
+void Server::recall_client_state(double ratio, bool flush_client_session,
+                                 MDSGatherBuilder *gather) {
+  if (flush_client_session) {
+    assert(gather != nullptr);
+  }
+
   /* try to recall at least 80% of all caps */
   uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
   uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
@@ -1162,16 +1211,18 @@ void Server::recall_client_state(void)
   /* ratio: determine the amount of caps to recall from each client. Use
    * percentage full over the cache reservation. Cap the ratio at 80% of client
    * caps. */
-  double ratio = 1.0-fmin(0.80, mdcache->cache_toofull_ratio());
+  if (ratio < 0.0)
+    ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio());
 
-  dout(10) << "recall_client_state " << ratio
-          << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
-          << dendl;
+  dout(10) << __func__ << ": ratio=" << ratio << ", caps per client "
+           << min_caps_per_client << "-" << max_caps_per_client << dendl;
 
   set<Session*> sessions;
   mds->sessionmap.get_client_session_set(sessions);
+
   for (auto &session : sessions) {
     if (!session->is_open() ||
+        !session->connection.get() ||
        !session->info.inst.name.is_client())
       continue;
 
@@ -1185,6 +1236,9 @@ void Server::recall_client_state(void)
       MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
       m->head.max_caps = newlim;
       mds->send_message_client(m, session);
+      if (flush_client_session) {
+        flush_session(session, gather);
+      }
       session->notify_recall_sent(newlim);
     }
   }
@@ -6158,8 +6212,6 @@ void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
   if (in->is_dir()) {
     assert(straydn);
     mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
-
-    in->maybe_export_pin(true);
   }
 
   journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
@@ -7968,16 +8020,17 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
 {
   dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
 
-  CDentry::linkage_t *destdnl = destdn->get_linkage();
+  CInode *in = destdn->get_linkage()->get_inode();
+
+  inodeno_t migrated_stray;
+  if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
+    migrated_stray = in->ino();
 
   list<MDSInternalContextBase*> finished;
   if (r == 0) {
     // unfreeze+singleauth inode
     //  hmm, do i really need to delay this?
     if (mdr->more()->is_inode_exporter) {
-
-      CInode *in = destdnl->get_inode();
-
       // drop our pins
       // we exported, clear out any xlocks that we moved to another MDS
       set<SimpleLock*>::iterator i = mdr->xlocks.begin();
@@ -7994,14 +8047,14 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
       bufferlist::iterator bp = mdr->more()->inode_import.begin();
       ::decode(peer_imported, bp);
 
-      dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
-      mdcache->migrator->finish_export_inode(destdnl->get_inode(), ceph_clock_now(),
-                                            mdr->slave_to_mds, peer_imported, finished);
+      dout(10) << " finishing inode export on " << *in << dendl;
+      mdcache->migrator->finish_export_inode(in, ceph_clock_now(), mdr->slave_to_mds,
+                                            peer_imported, finished);
       mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.
 
       // unfreeze
-      assert(destdnl->get_inode()->is_frozen_inode());
-      destdnl->get_inode()->unfreeze_inode(finished);
+      assert(in->is_frozen_inode());
+      in->unfreeze_inode(finished);
     }
 
     // singleauth
@@ -8037,8 +8090,8 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
     // witness list from the master, and they failed before we tried prep again.
     if (mdr->more()->rollback_bl.length()) {
       if (mdr->more()->is_inode_exporter) {
-       dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
-       destdnl->get_inode()->abort_export();
+       dout(10) << " reversing inode export of " << *in << dendl;
+       in->abort_export();
       }
       if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
        mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
@@ -8061,6 +8114,9 @@ void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
       mdcache->request_finish(mdr);
     }
   }
+
+  if (migrated_stray && mds->is_stopping())
+    mdcache->shutdown_export_stray_finish(migrated_stray);
 }
 
 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
index db6c7713914b2386c0e7ddee0d4a4c61087577eb..4fff7ae6fe17c807e86f6831e0a46f70f61c7fe4 100644 (file)
@@ -89,6 +89,9 @@ private:
   int failed_reconnects;
   bool reconnect_evicting;  // true if I am waiting for evictions to complete
                             // before proceeding to reconnect_gather_finish
+  time reconnect_start = time::min();
+  time reconnect_last_seen = time::min();
+  set<client_t> client_reconnect_gather;  // clients i need a reconnect msg from.
 
   double cap_revoke_eviction_timeout = 0;
 
@@ -114,8 +117,6 @@ public:
   void handle_osd_map();
 
   // -- sessions and recovery --
-  utime_t  reconnect_start;
-  set<client_t> client_reconnect_gather;  // clients i need a reconnect msg from.
   bool waiting_for_reconnect(client_t c) const;
   void dump_reconnect_status(Formatter *f) const;
 
@@ -140,7 +141,8 @@ public:
   void reconnect_tick();
   void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
 
-  void recall_client_state(void);
+  void recall_client_state(double ratio, bool flush_client_session,
+                           MDSGatherBuilder *gather);
   void force_clients_readonly();
 
   // -- requests --
@@ -320,6 +322,7 @@ public:
 
 private:
   void reply_client_request(MDRequestRef& mdr, MClientReply *reply);
+  void flush_session(Session *session, MDSGatherBuilder *gather);
 };
 
 #endif
index 217b4ba1511ef78aa312086dc8ddb4e441f315af..eee382f4af7849033909b0610f5182c00fa91ef8 100644 (file)
@@ -246,6 +246,7 @@ public:
   xlist<Capability*> caps;     // inodes with caps; front=most recently used
   xlist<ClientLease*> leases;  // metadata leases to clients
   time last_cap_renew = time::min();
+  time last_seen = time::min();
 
 public:
   version_t inc_push_seq() { return ++cap_push_seq; }
index b94dd57cba292fec2c0c1fee3a4750c55c361c95..f331722ba4d3964baf2d21f889de04f472bcb568 100644 (file)
@@ -229,8 +229,6 @@ void StrayManager::_purge_stray_purged(
 
     mds->mdlog->submit_entry(le, new C_PurgeStrayLogged(this, dn, pdv,
           mds->mdlog->get_current_segment()));
-
-    logger->set(l_mdc_num_strays, num_strays);
   }
 }
 
@@ -263,9 +261,13 @@ void StrayManager::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *l
   }
 
   // drop inode
+  inodeno_t ino = in->ino();
   if (in->is_dirty())
     in->mark_clean();
-  in->mdcache->remove_inode(in);
+  mds->mdcache->remove_inode(in);
+
+  if (mds->is_stopping())
+    mds->mdcache->shutdown_export_stray_finish(ino);
 }
 
 void StrayManager::enqueue(CDentry *dn, bool trunc)
@@ -465,7 +467,7 @@ bool StrayManager::_eval_stray(CDentry *dn, bool delay)
        return false;  // not until some snaps are deleted.
       }
 
-      in->mdcache->clear_dirty_bits_for_stray(in);
+      mds->mdcache->clear_dirty_bits_for_stray(in);
 
       if (!in->remote_parents.empty()) {
        // unlink any stale remote snap dentry.
@@ -748,11 +750,15 @@ void StrayManager::_truncate_stray_logged(CDentry *dn, LogSegment *ls)
 
   dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
 
+  in->pop_and_dirty_projected_inode(ls);
+
+  in->state_clear(CInode::STATE_PURGING);
   dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED);
   dn->put(CDentry::PIN_PURGING);
 
-  in->pop_and_dirty_projected_inode(ls);
-
   eval_stray(dn);
+
+  if (!dn->state_test(CDentry::STATE_PURGING) &&  mds->is_stopping())
+    mds->mdcache->shutdown_export_stray_finish(in->ino());
 }
 
index 2f81e4054b7e549de4f3bda6dd2bcf0c152cb78f..87c399b802b44c6ba10f84a996e9a9d12cc2420c 100644 (file)
@@ -773,11 +773,7 @@ bool DaemonServer::handle_command(MCommand *m)
       f.reset(Formatter::create("json-pretty"));
     // only include state from services that are in the persisted service map
     f->open_object_section("service_status");
-    ServiceMap s;
-    cluster_state.with_servicemap([&](const ServiceMap& service_map) {
-       s = service_map;
-      });
-    for (auto& p : s.services) {
+    for (auto& p : pending_service_map.services) {
       f->open_object_section(p.first.c_str());
       for (auto& q : p.second.daemons) {
        f->open_object_section(q.first.c_str());
@@ -853,6 +849,7 @@ bool DaemonServer::handle_command(MCommand *m)
       ss << "pg " << pgid << " primary osd." << acting_primary
         << " is not currently connected";
       cmdctx->reply(-EAGAIN, ss);
+      return true;
     }
     vector<pg_t> pgs = { pgid };
     for (auto& con : p->second) {
index dc6726739d3657f800fac5c0aa5ad3d7d58e37e7..9ba7cf926d899bce1d9b4597be76f3bf7b8c3fb6 100644 (file)
@@ -133,8 +133,6 @@ void DaemonPerfCounters::update(MMgrReport *report)
   for (const auto &t : report->declare_types) {
     types.insert(std::make_pair(t.path, t));
     session->declared_types.insert(t.path);
-    instances.insert(std::pair<std::string, PerfCounterInstance>(
-                     t.path, PerfCounterInstance(t.type)));
   }
   // Remove any old types
   for (const auto &t : report->undeclare_types) {
@@ -148,6 +146,13 @@ void DaemonPerfCounters::update(MMgrReport *report)
   DECODE_START(1, p);
   for (const auto &t_path : session->declared_types) {
     const auto &t = types.at(t_path);
+    auto instances_it = instances.find(t_path);
+    // Always check the instance exists, as we don't prevent yet
+    // multiple sessions from daemons with the same name, and one
+    // session clearing stats created by another on open.
+    if (instances_it == instances.end()) {
+      instances_it = instances.insert({t_path, t.type}).first;
+    }
     uint64_t val = 0;
     uint64_t avgcount = 0;
     uint64_t avgcount2 = 0;
@@ -156,9 +161,9 @@ void DaemonPerfCounters::update(MMgrReport *report)
     if (t.type & PERFCOUNTER_LONGRUNAVG) {
       ::decode(avgcount, p);
       ::decode(avgcount2, p);
-      instances.at(t_path).push_avg(now, val, avgcount);
+      instances_it->second.push_avg(now, val, avgcount);
     } else {
-      instances.at(t_path).push(now, val);
+      instances_it->second.push(now, val);
     }
   }
   DECODE_FINISH(p);
index 965958b1220ecc45f6df3dcf62117fe9140ef812..338d55b085310bb8f8d494d9beeda51bd7d499d3 100644 (file)
@@ -286,7 +286,14 @@ bool AuthMonitor::preprocess_query(MonOpRequestRef op)
   dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return preprocess_command(op);
+    try {
+      return preprocess_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
 
   case CEPH_MSG_AUTH:
     return prep_auth(op, false);
@@ -306,7 +313,14 @@ bool AuthMonitor::prepare_update(MonOpRequestRef op)
   dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return prepare_command(op);
+    try {
+      return prepare_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
   case MSG_MON_GLOBAL_ID:
     return prepare_global_id(op);
   case CEPH_MSG_AUTH:
@@ -556,7 +570,7 @@ bool AuthMonitor::preprocess_command(MonOpRequestRef op)
   }
 
   string prefix;
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
   if (prefix == "auth add" ||
       prefix == "auth del" ||
       prefix == "auth rm" ||
@@ -576,7 +590,7 @@ bool AuthMonitor::preprocess_command(MonOpRequestRef op)
 
   // entity might not be supplied, but if it is, it should be valid
   string entity_name;
-  cmd_getval(g_ceph_context, cmdmap, "entity", entity_name);
+  cmd_getval_throws(g_ceph_context, cmdmap, "entity", entity_name);
   EntityName entity;
   if (!entity_name.empty() && !entity.from_str(entity_name)) {
     ss << "invalid entity_auth " << entity_name;
@@ -585,7 +599,7 @@ bool AuthMonitor::preprocess_command(MonOpRequestRef op)
   }
 
   string format;
-  cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+  cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
   boost::scoped_ptr<Formatter> f(Formatter::create(format));
 
   if (prefix == "auth export") {
@@ -1047,10 +1061,10 @@ bool AuthMonitor::prepare_command(MonOpRequestRef op)
   string entity_name;
   EntityName entity;
 
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
 
   string format;
-  cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+  cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
   boost::scoped_ptr<Formatter> f(Formatter::create(format));
 
   MonSession *session = m->get_session();
@@ -1059,14 +1073,14 @@ bool AuthMonitor::prepare_command(MonOpRequestRef op)
     return true;
   }
 
-  cmd_getval(g_ceph_context, cmdmap, "caps", caps_vec);
+  cmd_getval_throws(g_ceph_context, cmdmap, "caps", caps_vec);
   if ((caps_vec.size() % 2) != 0) {
     ss << "bad capabilities request; odd number of arguments";
     err = -EINVAL;
     goto done;
   }
 
-  cmd_getval(g_ceph_context, cmdmap, "entity", entity_name);
+  cmd_getval_throws(g_ceph_context, cmdmap, "entity", entity_name);
   if (!entity_name.empty() && !entity.from_str(entity_name)) {
     ss << "bad entity name";
     err = -EINVAL;
@@ -1298,7 +1312,7 @@ bool AuthMonitor::prepare_command(MonOpRequestRef op)
     return true;
   } else if (prefix == "fs authorize") {
     string filesystem;
-    cmd_getval(g_ceph_context, cmdmap, "filesystem", filesystem);
+    cmd_getval_throws(g_ceph_context, cmdmap, "filesystem", filesystem);
     string mds_cap_string, osd_cap_string;
     string osd_cap_wanted = "r";
 
index e191f8367c80276d9f7f55c4d8973e269188417a..6d23ff6bac93413c24859bedaff9b418cabca835 100644 (file)
@@ -189,9 +189,9 @@ bool ConfigKeyService::service_dispatch(MonOpRequestRef op)
     return false;
   }
 
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
   string key;
-  cmd_getval(g_ceph_context, cmdmap, "key", key);
+  cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
 
   if (prefix == "config-key get") {
     ret = store_get(key, rdata);
@@ -212,7 +212,7 @@ bool ConfigKeyService::service_dispatch(MonOpRequestRef op)
 
     bufferlist data;
     string val;
-    if (cmd_getval(g_ceph_context, cmdmap, "val", val)) {
+    if (cmd_getval_throws(g_ceph_context, cmdmap, "val", val)) {
       // they specified a value in the command instead of a file
       data.append(val);
     } else if (cmd->get_data_len() > 0) {
index 3ce6f5565f00e320517ab1bf1a3d6241d54cb4ee..cb3e54711b87d42fd250a6e4ee65da5a3594becd 100644 (file)
@@ -44,13 +44,13 @@ class FlagSetHandler : public FileSystemCommandHandler
       std::stringstream &ss) override
   {
     string flag_name;
-    cmd_getval(g_ceph_context, cmdmap, "flag_name", flag_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "flag_name", flag_name);
 
     string flag_val;
-    cmd_getval(g_ceph_context, cmdmap, "val", flag_val);
+    cmd_getval_throws(g_ceph_context, cmdmap, "val", flag_val);
 
     string confirm;
-    cmd_getval(g_ceph_context, cmdmap, "confirm", confirm);
+    cmd_getval_throws(g_ceph_context, cmdmap, "confirm", confirm);
 
     if (flag_name == "enable_multiple") {
       bool flag_bool = false;
@@ -99,7 +99,7 @@ class FsNewHandler : public FileSystemCommandHandler
     assert(m_paxos->is_plugged());
 
     string metadata_name;
-    cmd_getval(g_ceph_context, cmdmap, "metadata", metadata_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "metadata", metadata_name);
     int64_t metadata = mon->osdmon()->osdmap.lookup_pg_pool_name(metadata_name);
     if (metadata < 0) {
       ss << "pool '" << metadata_name << "' does not exist";
@@ -107,7 +107,7 @@ class FsNewHandler : public FileSystemCommandHandler
     }
 
     string force_str;
-    cmd_getval(g_ceph_context,cmdmap, "force", force_str);
+    cmd_getval_throws(g_ceph_context,cmdmap, "force", force_str);
     bool force = (force_str == "--force");
     const pool_stat_t *stat = mon->pgservice->get_pool_stat(metadata);
     if (stat) {
@@ -120,7 +120,7 @@ class FsNewHandler : public FileSystemCommandHandler
     }
 
     string data_name;
-    cmd_getval(g_ceph_context, cmdmap, "data", data_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "data", data_name);
     int64_t data = mon->osdmon()->osdmap.lookup_pg_pool_name(data_name);
     if (data < 0) {
       ss << "pool '" << data_name << "' does not exist";
@@ -132,7 +132,7 @@ class FsNewHandler : public FileSystemCommandHandler
     }
 
     string fs_name;
-    cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name);
     if (fs_name.empty()) {
         // Ensure fs name is not empty so that we can implement
         // commands that refer to FS by name in future.
@@ -166,7 +166,7 @@ class FsNewHandler : public FileSystemCommandHandler
       string sure;
       if ((std::find(data_pools.begin(), data_pools.end(), data) != data_pools.end()
           || fs->mds_map.get_metadata_pool() == metadata)
-         && ((!cmd_getval(g_ceph_context, cmdmap, "sure", sure)
+         && ((!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure)
               || sure != "--allow-dangerous-metadata-overlay"))) {
        ss << "Filesystem '" << fs_name
           << "' is already using one of the specified RADOS pools. This should ONLY be done in emergencies and after careful reading of the documentation. Pass --allow-dangerous-metadata-overlay to permit this.";
@@ -230,7 +230,7 @@ public:
       std::stringstream &ss) override
   {
     std::string fs_name;
-    if (!cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name) || fs_name.empty()) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name) || fs_name.empty()) {
       ss << "Missing filesystem name";
       return -EINVAL;
     }
@@ -242,14 +242,14 @@ public:
     }
 
     string var;
-    if (!cmd_getval(g_ceph_context, cmdmap, "var", var) || var.empty()) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "var", var) || var.empty()) {
       ss << "Invalid variable";
       return -EINVAL;
     }
     string val;
     string interr;
     int64_t n = 0;
-    if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "val", val)) {
       return -EINVAL;
     }
     // we got a string.  see if it contains an int.
@@ -290,7 +290,7 @@ public:
 
       if (enable_inline) {
        string confirm;
-       if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
+       if (!cmd_getval_throws(g_ceph_context, cmdmap, "confirm", confirm) ||
            confirm != "--yes-i-really-mean-it") {
          ss << EXPERIMENTAL_WARNING;
          return -EPERM;
@@ -455,6 +455,36 @@ public:
       {
         fs->mds_map.set_standby_count_wanted(n);
       });
+    } else if (var == "session_timeout") {
+      if (interr.length()) {
+       ss << var << " requires an integer value";
+       return -EINVAL;
+      }
+      if (n < 30) {
+       ss << var << " must be at least 30s";
+       return -ERANGE;
+      }
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [n](std::shared_ptr<Filesystem> fs)
+      {
+        fs->mds_map.set_session_timeout((uint32_t)n);
+      });
+    } else if (var == "session_autoclose") {
+      if (interr.length()) {
+       ss << var << " requires an integer value";
+       return -EINVAL;
+      }
+      if (n < 30) {
+       ss << var << " must be at least 30s";
+       return -ERANGE;
+      }
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [n](std::shared_ptr<Filesystem> fs)
+      {
+        fs->mds_map.set_session_autoclose((uint32_t)n);
+      });
     } else {
       ss << "unknown variable " << var;
       return -EINVAL;
@@ -485,10 +515,10 @@ class AddDataPoolHandler : public FileSystemCommandHandler
     assert(m_paxos->is_plugged());
 
     string poolname;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
 
     std::string fs_name;
-    if (!cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name)
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name)
         || fs_name.empty()) {
       ss << "Missing filesystem name";
       return -EINVAL;
@@ -564,7 +594,7 @@ class SetDefaultHandler : public FileSystemCommandHandler
       std::stringstream &ss) override
   {
     std::string fs_name;
-    cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name);
     auto fs = fsmap.get_filesystem(fs_name);
     if (fs == nullptr) {
         ss << "filesystem '" << fs_name << "' does not exist";
@@ -594,7 +624,7 @@ class RemoveFilesystemHandler : public FileSystemCommandHandler
     // (redundant while there is only one FS, but command
     //  syntax should apply to multi-FS future)
     string fs_name;
-    cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name);
     auto fs = fsmap.get_filesystem(fs_name);
     if (fs == nullptr) {
         // Consider absence success to make deletes idempotent
@@ -610,7 +640,7 @@ class RemoveFilesystemHandler : public FileSystemCommandHandler
 
     // Check for confirmation flag
     string sure;
-    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+    cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
     if (sure != "--yes-i-really-mean-it") {
       ss << "this is a DESTRUCTIVE operation and will make data in your filesystem permanently" \
             " inaccessible.  Add --yes-i-really-mean-it if you are sure you wish to continue.";
@@ -655,7 +685,7 @@ class ResetFilesystemHandler : public FileSystemCommandHandler
       std::stringstream &ss) override
   {
     string fs_name;
-    cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name);
     auto fs = fsmap.get_filesystem(fs_name);
     if (fs == nullptr) {
         ss << "filesystem '" << fs_name << "' does not exist";
@@ -672,7 +702,7 @@ class ResetFilesystemHandler : public FileSystemCommandHandler
 
     // Check for confirmation flag
     string sure;
-    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+    cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
     if (sure != "--yes-i-really-mean-it") {
       ss << "this is a potentially destructive operation, only for use by experts in disaster recovery.  "
         "Add --yes-i-really-mean-it if you are sure you wish to continue.";
@@ -700,10 +730,10 @@ class RemoveDataPoolHandler : public FileSystemCommandHandler
       std::stringstream &ss) override
   {
     string poolname;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
 
     std::string fs_name;
-    if (!cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name)
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "fs_name", fs_name)
         || fs_name.empty()) {
       ss << "Missing filesystem name";
       return -EINVAL;
index 4987e67da5e78772dd104e243b470d92740e3a95..fc983bceecdb476a1100aaed4af185ec683303c5 100644 (file)
@@ -256,7 +256,14 @@ bool LogMonitor::preprocess_query(MonOpRequestRef op)
   dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return preprocess_command(op);
+    try {
+      return preprocess_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
 
   case MSG_LOG:
     return preprocess_log(op);
@@ -274,7 +281,14 @@ bool LogMonitor::prepare_update(MonOpRequestRef op)
   dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return prepare_command(op);
+    try {
+      return prepare_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
   case MSG_LOG:
     return prepare_log(op);
   default:
@@ -395,22 +409,22 @@ bool LogMonitor::preprocess_command(MonOpRequestRef op)
   }
 
   string prefix;
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
 
   string format;
-  cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+  cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
   boost::scoped_ptr<Formatter> f(Formatter::create(format));
 
   if (prefix == "log last") {
     int64_t num = 20;
-    cmd_getval(g_ceph_context, cmdmap, "num", num);
+    cmd_getval_throws(g_ceph_context, cmdmap, "num", num);
     if (f) {
       f->open_array_section("tail");
     }
 
     std::string level_str;
     clog_type level;
-    if (cmd_getval(g_ceph_context, cmdmap, "level", level_str)) {
+    if (cmd_getval_throws(g_ceph_context, cmdmap, "level", level_str)) {
       level = LogEntry::str_to_level(level_str);
       if (level == CLOG_UNKNOWN) {
         ss << "Invalid severity '" << level_str << "'";
@@ -422,7 +436,7 @@ bool LogMonitor::preprocess_command(MonOpRequestRef op)
     }
 
     std::string channel;
-    if (!cmd_getval(g_ceph_context, cmdmap, "channel", channel)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "channel", channel)) {
       channel = CLOG_CHANNEL_DEFAULT;
     }
 
@@ -488,7 +502,7 @@ bool LogMonitor::prepare_command(MonOpRequestRef op)
   }
 
   string prefix;
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
 
   MonSession *session = m->get_session();
   if (!session) {
@@ -498,7 +512,7 @@ bool LogMonitor::prepare_command(MonOpRequestRef op)
 
   if (prefix == "log") {
     vector<string> logtext;
-    cmd_getval(g_ceph_context, cmdmap, "logtext", logtext);
+    cmd_getval_throws(g_ceph_context, cmdmap, "logtext", logtext);
     LogEntry le;
     le.who = m->get_orig_source_inst();
     le.name = session->entity_name;
index 1ad416170eb57b20856084d022d11a81a9e9ae51..7be5f14f77a65b808a1103974a4829587eb50b46 100644 (file)
@@ -309,7 +309,14 @@ bool MDSMonitor::preprocess_query(MonOpRequestRef op)
     return preprocess_beacon(op);
     
   case MSG_MON_COMMAND:
-    return preprocess_command(op);
+    try {
+      return preprocess_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
 
   case MSG_MDS_OFFLOAD_TARGETS:
     return preprocess_offload_targets(op);
@@ -519,7 +526,14 @@ bool MDSMonitor::prepare_update(MonOpRequestRef op)
     return prepare_beacon(op);
 
   case MSG_MON_COMMAND:
-    return prepare_command(op);
+    try {
+      return prepare_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
 
   case MSG_MDS_OFFLOAD_TARGETS:
     return prepare_offload_targets(op);
@@ -686,7 +700,9 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
         });
     }
 
-    if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
+    if (info.state == MDSMap::STATE_STOPPING &&
+        state != MDSMap::STATE_STOPPING &&
+        state != MDSMap::STATE_STOPPED) {
       // we can't transition to any other states from STOPPING
       dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
               << dendl;
index fe5bf828ac6fd86609ce4b299c81b16f45157234..17a59add3e9fb0a0723a9169b81b7c4265c2890a 100644 (file)
@@ -231,7 +231,15 @@ bool MgrMonitor::preprocess_query(MonOpRequestRef op)
     case MSG_MGR_BEACON:
       return preprocess_beacon(op);
     case MSG_MON_COMMAND:
-      return preprocess_command(op);
+      try {
+       return preprocess_command(op);
+      }
+      catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
+
     default:
       mon->no_reply(op);
       derr << "Unhandled message type " << m->get_type() << dendl;
@@ -247,7 +255,14 @@ bool MgrMonitor::prepare_update(MonOpRequestRef op)
       return prepare_beacon(op);
 
     case MSG_MON_COMMAND:
-      return prepare_command(op);
+      try {
+       return prepare_command(op);
+      }
+      catch (const bad_cmd_get& e) {
+       bufferlist bl;
+       mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+       return true;
+      }
 
     default:
       mon->no_reply(op);
@@ -723,16 +738,16 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op)
   }
 
   string format;
-  cmd_getval(g_ceph_context, cmdmap, "format", format, string("json-pretty"));
+  cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("json-pretty"));
   boost::scoped_ptr<Formatter> f(Formatter::create(format));
 
   string prefix;
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
   int r = 0;
 
   if (prefix == "mgr dump") {
     int64_t epoch = 0;
-    cmd_getval(g_ceph_context, cmdmap, "epoch", epoch, (int64_t)map.get_epoch());
+    cmd_getval_throws(g_ceph_context, cmdmap, "epoch", epoch, (int64_t)map.get_epoch());
     if (epoch == (int64_t)map.get_epoch()) {
       f->dump_object("mgrmap", map);
     } else {
@@ -776,14 +791,14 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op)
     f->flush(rdata);
   } else if (prefix == "mgr metadata") {
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "id", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "id", name);
     if (name.size() > 0 && !map.have_name(name)) {
       ss << "mgr." << name << " does not exist";
       r = -ENOENT;
       goto reply;
     }
     string format;
-    cmd_getval(g_ceph_context, cmdmap, "format", format);
+    cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
     boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
     if (name.size()) {
       f->open_object_section("mgr_metadata");
@@ -822,7 +837,7 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op)
     if (!f)
       f.reset(Formatter::create("json-pretty"));
     string field;
-    cmd_getval(g_ceph_context, cmdmap, "property", field);
+    cmd_getval_throws(g_ceph_context, cmdmap, "property", field);
     count_metadata(field, f.get());
     f->flush(rdata);
     r = 0;
@@ -858,17 +873,17 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
   }
 
   string format;
-  cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+  cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
   boost::scoped_ptr<Formatter> f(Formatter::create(format));
 
   string prefix;
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
 
   int r = 0;
 
   if (prefix == "mgr fail") {
     string who;
-    cmd_getval(g_ceph_context, cmdmap, "who", who);
+    cmd_getval_throws(g_ceph_context, cmdmap, "who", who);
 
     std::string err;
     uint64_t gid = strict_strtol(who.c_str(), 10, &err);
@@ -910,13 +925,13 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
     }
   } else if (prefix == "mgr module enable") {
     string module;
-    cmd_getval(g_ceph_context, cmdmap, "module", module);
+    cmd_getval_throws(g_ceph_context, cmdmap, "module", module);
     if (module.empty()) {
       r = -EINVAL;
       goto out;
     }
     string force;
-    cmd_getval(g_ceph_context, cmdmap, "force", force);
+    cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
     if (!pending_map.all_support_module(module) &&
        force != "--force") {
       ss << "all mgr daemons do not support module '" << module << "', pass "
@@ -927,7 +942,7 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
     pending_map.modules.insert(module);
   } else if (prefix == "mgr module disable") {
     string module;
-    cmd_getval(g_ceph_context, cmdmap, "module", module);
+    cmd_getval_throws(g_ceph_context, cmdmap, "module", module);
     if (module.empty()) {
       r = -EINVAL;
       goto out;
index c93644f6811e9f08065d29be961f540e46bdb37c..95cb8bf71738949465cb7146dfa5d0aaac9a5e20 100644 (file)
@@ -353,6 +353,12 @@ mon_rwxa_t MonCapGrant::get_allowed(CephContext *cct,
     }
     return MON_CAP_ALL;
   }
+  // we don't allow config-key service to be accessed with blanket caps other
+  // than '*' (i.e., 'any'), and that should have been checked by the caller
+  // via 'is_allow_all()'.
+  if (s == "config-key") {
+    return 0;
+  }
   return allow;
 }
 
@@ -487,7 +493,7 @@ struct MonCapParser : qi::grammar<Iterator, MonCap()>
     quoted_string %=
       lexeme['"' >> +(char_ - '"') >> '"'] | 
       lexeme['\'' >> +(char_ - '\'') >> '\''];
-    unquoted_word %= +char_("a-zA-Z0-9_.-");
+    unquoted_word %= +char_("a-zA-Z0-9_./-");
     str %= quoted_string | unquoted_word;
 
     spaces = +(lit(' ') | lit('\n') | lit('\t'));
index f5bb5c5bbf54844d2ed306d4153b21ff40e19315..6890af25a0bc5b0e33ca4c74513e07499db0890d 100644 (file)
@@ -404,10 +404,10 @@ COMMAND("fs set " \
        "name=fs_name,type=CephString " \
        "name=var,type=CephChoices,strings=max_mds|max_file_size"
         "|allow_new_snaps|inline_data|cluster_down|allow_multimds|allow_dirfrags|balancer" \
-        "|standby_count_wanted " \
+        "|standby_count_wanted|session_timeout|session_autoclose " \
        "name=val,type=CephString "                                     \
        "name=confirm,type=CephString,req=false",                       \
-       "set mds parameter <var> to <val>", "mds", "rw", "cli,rest")
+       "set fs parameter <var> to <val>", "mds", "rw", "cli,rest")
 COMMAND("fs flag set name=flag_name,type=CephChoices,strings=enable_multiple "
         "name=val,type=CephString " \
        "name=confirm,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
@@ -742,7 +742,7 @@ COMMAND("osd erasure-code-profile ls", \
        "list all erasure code profiles", \
        "osd", "r", "cli,rest")
 COMMAND("osd set " \
-       "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|recovery_deletes|require_jewel_osds|require_kraken_osds " \
+       "name=key,type=CephChoices,strings=full|pause|noup|nodown|noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|notieragent|sortbitwise|recovery_deletes|require_jewel_osds|require_kraken_osds||pglog_hardlimit " \
        "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
        "set <key>", "osd", "rw", "cli,rest")
 COMMAND("osd unset " \
index 9960dea46df40bde7c435f015618bb92988983cc..76c3c77081b8845f50856acb82d464803ff95489 100644 (file)
@@ -889,7 +889,9 @@ void Monitor::shutdown()
 
   state = STATE_SHUTDOWN;
 
+  lock.Unlock();
   g_conf->remove_observer(this);
+  lock.Lock();
 
   if (admin_hook) {
     AdminSocket* admin_socket = cct->get_admin_socket();
@@ -929,6 +931,16 @@ void Monitor::shutdown()
 
   remove_all_sessions();
 
+  log_client.shutdown();
+
+  // unlock before msgr shutdown...
+  lock.Unlock();
+
+  // shutdown messenger before removing logger from perfcounter collection, 
+  // otherwise _ms_dispatch() will try to update deleted logger
+  messenger->shutdown();
+  mgr_messenger->shutdown();
+
   if (logger) {
     cct->get_perfcounters_collection()->remove(logger);
     delete logger;
@@ -940,14 +952,6 @@ void Monitor::shutdown()
     delete cluster_logger;
     cluster_logger = NULL;
   }
-
-  log_client.shutdown();
-
-  // unlock before msgr shutdown...
-  lock.Unlock();
-
-  messenger->shutdown();  // last thing!  ceph_mon.cc will delete mon.
-  mgr_messenger->shutdown();
 }
 
 void Monitor::wait_for_paxos_write()
index 8abcf81312e9d73aae9014f3380cfb28eb82df57..78f4a82001f72b83eefb636496977c7c6964b31b 100644 (file)
@@ -201,7 +201,14 @@ bool MonmapMonitor::preprocess_query(MonOpRequestRef op)
   switch (m->get_type()) {
     // READs
   case MSG_MON_COMMAND:
-    return preprocess_command(op);
+    try {
+      return preprocess_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
   case MSG_MON_JOIN:
     return preprocess_join(op);
   default:
@@ -238,7 +245,7 @@ bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
   }
 
   string prefix;
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
 
   MonSession *session = m->get_session();
   if (!session) {
@@ -247,7 +254,7 @@ bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
   }
 
   string format;
-  cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+  cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
   boost::scoped_ptr<Formatter> f(Formatter::create(format));
 
   if (prefix == "mon stat") {
@@ -264,7 +271,7 @@ bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
 
     epoch_t epoch;
     int64_t epochnum;
-    cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)0);
+    cmd_getval_throws(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)0);
     epoch = epochnum;
 
     MonMap *p = mon->monmap;
@@ -315,7 +322,7 @@ bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
    
     bool list_with_value = false;
     string with_value;
-    if (cmd_getval(g_ceph_context, cmdmap, "with_value", with_value) &&
+    if (cmd_getval_throws(g_ceph_context, cmdmap, "with_value", with_value) &&
         with_value == "--with-value") {
       list_with_value = true;
     }
@@ -403,7 +410,14 @@ bool MonmapMonitor::prepare_update(MonOpRequestRef op)
   
   switch (m->get_type()) {
   case MSG_MON_COMMAND:
-    return prepare_command(op);
+    try {
+      return prepare_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
   case MSG_MON_JOIN:
     return prepare_join(op);
   default:
@@ -428,7 +442,7 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op)
   }
 
   string prefix;
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
 
   MonSession *session = m->get_session();
   if (!session) {
@@ -489,9 +503,9 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op)
   bool propose = false;
   if (prefix == "mon add") {
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
     string addrstr;
-    cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "addr", addrstr);
     entity_addr_t addr;
     bufferlist rdata;
 
@@ -559,7 +573,7 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op)
   } else if (prefix == "mon remove" ||
              prefix == "mon rm") {
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
     if (!monmap.contains(name)) {
       err = 0;
       ss << "mon." << name << " does not exist or has already been removed";
@@ -625,7 +639,7 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op)
      * 'mon flag set/unset'.
      */
     string feature_name;
-    if (!cmd_getval(g_ceph_context, cmdmap, "feature_name", feature_name)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "feature_name", feature_name)) {
       ss << "missing required feature name";
       err = -EINVAL;
       goto reply;
@@ -640,7 +654,7 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op)
     }
 
     string sure;
-    if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure) ||
         sure != "--yes-i-really-mean-it") {
       ss << "please specify '--yes-i-really-mean-it' if you "
          << "really, **really** want to set feature '"
@@ -674,7 +688,6 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op)
             << "persistent = " << pending_map.persistent_features
             // output optional nevertheless, for auditing purposes.
             << ", optional = " << pending_map.optional_features << dendl;
-    
   } else {
     ss << "unknown command " << prefix;
     err = -EINVAL;
index 8376a40668515147de23329a842182d8cd261697..9ce27c1362932f3c7b58d11c16e720b9feae7511 100644 (file)
@@ -1006,7 +1006,8 @@ void OSDMonitor::prime_pg_temp(
   int next_up_primary, next_acting_primary;
   next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
-  if (acting == next_acting && next_up != next_acting)
+  if (acting == next_acting &&
+      !(up != acting && next_up == next_acting))
     return;  // no change since last epoch
 
   if (acting.empty())
@@ -1723,7 +1724,14 @@ bool OSDMonitor::preprocess_query(MonOpRequestRef op)
   switch (m->get_type()) {
     // READs
   case MSG_MON_COMMAND:
-    return preprocess_command(op);
+    try {
+      return preprocess_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
   case CEPH_MSG_MON_GET_OSDMAP:
     return preprocess_get_osdmap(op);
 
@@ -1783,7 +1791,14 @@ bool OSDMonitor::prepare_update(MonOpRequestRef op)
     return prepare_beacon(op);
 
   case MSG_MON_COMMAND:
-    return prepare_command(op);
+    try {
+      return prepare_command(op);
+    }
+    catch (const bad_cmd_get& e) {
+      bufferlist bl;
+      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
+      return true;
+    }
 
   case CEPH_MSG_POOLOP:
     return prepare_pool_op(op);
@@ -2502,6 +2517,17 @@ bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
     }
   }
 
+  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
+  // we are reusing a jewel feature bit that was retired in luminous.
+  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
+      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
+    mon->clog->info() << "disallowing boot of OSD "
+                     << m->get_orig_source_inst()
+                     << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
+    goto ignore;
+  }
+
   // already booted?
   if (osdmap.is_up(from) &&
       osdmap.get_inst(from) == m->get_orig_source_inst() &&
@@ -3064,6 +3090,7 @@ bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
 
   // check privilege, ignore if failed
   MonSession *session = m->get_session();
+  mon->no_reply(op);
   if (!session)
     goto ignore;
   if (!session->caps.is_capable(
@@ -4260,10 +4287,10 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
   }
 
   string prefix;
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
 
   string format;
-  cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+  cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
   boost::scoped_ptr<Formatter> f(Formatter::create(format));
 
   if (prefix == "osd stat") {
@@ -4288,7 +4315,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 
     epoch_t epoch = 0;
     int64_t epochnum;
-    cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
+    cmd_getval_throws(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
     epoch = epochnum;
     
     bufferlist osdmap_bl;
@@ -4352,7 +4379,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       rdata.append(ds);
     } else if (prefix == "osd tree") {
       vector<string> states;
-      cmd_getval(g_ceph_context, cmdmap, "states", states);
+      cmd_getval_throws(g_ceph_context, cmdmap, "states", states);
       unsigned filter = 0;
       for (auto& s : states) {
        if (s == "up") {
@@ -4404,7 +4431,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       ss << p->get_crush_version();
     } else if (prefix == "osd ls-tree") {
       string bucket_name;
-      cmd_getval(g_ceph_context, cmdmap, "name", bucket_name);
+      cmd_getval_throws(g_ceph_context, cmdmap, "name", bucket_name);
       set<int> osds;
       r = p->get_osds_by_bucket_name(bucket_name, &osds);
       if (r == -ENOENT) {
@@ -4466,7 +4493,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     goto reply;
   } else if (prefix  == "osd find") {
     int64_t osd;
-    if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", osd)) {
       ss << "unable to parse osd id value '"
          << cmd_vartype_stringify(cmdmap["id"]) << "'";
       r = -EINVAL;
@@ -4478,11 +4505,12 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       goto reply;
     }
     string format;
-    cmd_getval(g_ceph_context, cmdmap, "format", format);
+    cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
     boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
     f->open_object_section("osd_location");
     f->dump_int("osd", osd);
     f->dump_stream("ip") << osdmap.get_addr(osd);
+    f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
     f->open_object_section("crush_location");
     map<string,string> loc = osdmap.crush->get_full_location(osd);
     for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
@@ -4493,7 +4521,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
   } else if (prefix == "osd metadata") {
     int64_t osd = -1;
     if (cmd_vartype_stringify(cmdmap["id"]).size() &&
-        !cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
+        !cmd_getval_throws(g_ceph_context, cmdmap, "id", osd)) {
       ss << "unable to parse osd id value '"
          << cmd_vartype_stringify(cmdmap["id"]) << "'";
       r = -EINVAL;
@@ -4505,7 +4533,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       goto reply;
     }
     string format;
-    cmd_getval(g_ceph_context, cmdmap, "format", format);
+    cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
     boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
     if (osd >= 0) {
       f->open_object_section("osd_metadata");
@@ -4546,15 +4574,15 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     if (!f)
       f.reset(Formatter::create("json-pretty"));
     string field;
-    cmd_getval(g_ceph_context, cmdmap, "property", field);
+    cmd_getval_throws(g_ceph_context, cmdmap, "property", field);
     count_metadata(field, f.get());
     f->flush(rdata);
     r = 0;
   } else if (prefix == "osd map") {
     string poolstr, objstr, namespacestr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
-    cmd_getval(g_ceph_context, cmdmap, "object", objstr);
-    cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "object", objstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "nspace", namespacestr);
 
     int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
     if (pool < 0) {
@@ -4608,7 +4636,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
   } else if (prefix == "pg map") {
     pg_t pgid;
     string pgidstr;
-    cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr);
     if (!pgid.parse(pgidstr.c_str())) {
       ss << "invalid pgid '" << pgidstr << "'";
       r = -EINVAL;
@@ -4685,7 +4713,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     }
   } else if (prefix == "osd lspools") {
     int64_t auid;
-    cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
+    cmd_getval_throws(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
     if (f)
       f->open_array_section("pools");
     for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
@@ -4736,7 +4764,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 
   } else if (prefix == "osd pool ls") {
     string detail;
-    cmd_getval(g_ceph_context, cmdmap, "detail", detail);
+    cmd_getval_throws(g_ceph_context, cmdmap, "detail", detail);
     if (!f && detail == "detail") {
       ostringstream ss;
       osdmap.print_pools(ss);
@@ -4768,7 +4796,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 
   } else if (prefix == "osd crush get-tunable") {
     string tunable;
-    cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
+    cmd_getval_throws(g_ceph_context, cmdmap, "tunable", tunable);
     ostringstream rss;
     if (f)
       f->open_object_section("tunable");
@@ -4791,7 +4819,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 
   } else if (prefix == "osd pool get") {
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
     if (pool < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -4801,7 +4829,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 
     const pg_pool_t *p = osdmap.get_pg_pool(pool);
     string var;
-    cmd_getval(g_ceph_context, cmdmap, "var", var);
+    cmd_getval_throws(g_ceph_context, cmdmap, "var", var);
 
     typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
     const choices_map_t ALL_CHOICES = {
@@ -5241,7 +5269,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
                                           osdmap, f.get(), &ss, &rdata);
   } else if (prefix == "osd pool get-quota") {
     string pool_name;
-    cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", pool_name);
 
     int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
     if (poolid < 0) {
@@ -5292,7 +5320,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     }
   } else if (prefix == "osd crush rule ls-by-class") {
     string class_name;
-    cmd_getval(g_ceph_context, cmdmap, "class", class_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "class", class_name);
     if (class_name.empty()) {
       ss << "no class specified";
       r = -EINVAL;
@@ -5320,9 +5348,9 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     }
   } else if (prefix == "osd crush rule dump") {
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
     string format;
-    cmd_getval(g_ceph_context, cmdmap, "format", format);
+    cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
     boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
     if (name == "") {
       f->open_array_section("rules");
@@ -5343,7 +5371,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     rdata.append(rs.str());
   } else if (prefix == "osd crush dump") {
     string format;
-    cmd_getval(g_ceph_context, cmdmap, "format", format);
+    cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
     boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
     f->open_object_section("crush_map");
     osdmap.crush->dump(f.get());
@@ -5354,7 +5382,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     rdata.append(rs.str());
   } else if (prefix == "osd crush show-tunables") {
     string format;
-    cmd_getval(g_ceph_context, cmdmap, "format", format);
+    cmd_getval_throws(g_ceph_context, cmdmap, "format", format);
     boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
     f->open_object_section("crush_map_tunables");
     osdmap.crush->dump_tunables(f.get());
@@ -5365,7 +5393,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     rdata.append(rs.str());
   } else if (prefix == "osd crush tree") {
     string shadow;
-    cmd_getval(g_ceph_context, cmdmap, "shadow", shadow);
+    cmd_getval_throws(g_ceph_context, cmdmap, "shadow", shadow);
     bool show_shadow = shadow == "--show-shadow";
     boost::scoped_ptr<Formatter> f(Formatter::create(format));
     if (f) {
@@ -5386,7 +5414,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     }
   } else if (prefix == "osd crush ls") {
     string name;
-    if (!cmd_getval(g_ceph_context, cmdmap, "node", name)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "node", name)) {
       ss << "no node specified";
       r = -EINVAL;
       goto reply;
@@ -5430,7 +5458,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     f->flush(rdata);
   } else if (prefix == "osd crush class ls-osd") {
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "class", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "class", name);
     set<int> osds;
     osdmap.crush->get_devices_by_class(name, &osds);
     if (f) {
@@ -5499,7 +5527,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     f->flush(rdata);
   } else if (prefix == "osd erasure-code-profile get") {
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
     if (!osdmap.has_erasure_code_profile(name)) {
       ss << "unknown erasure code profile '" << name << "'";
       r = -ENOENT;
@@ -5527,11 +5555,11 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
     boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
                                                      "json-pretty"));
     string pool_name;
-    cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", pool_name);
     string app;
-    cmd_getval(g_ceph_context, cmdmap, "app", app);
+    cmd_getval_throws(g_ceph_context, cmdmap, "app", app);
     string key;
-    cmd_getval(g_ceph_context, cmdmap, "key", key);
+    cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
 
     if (pool_name.empty()) {
       // all
@@ -6413,14 +6441,14 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
                                          stringstream& ss)
 {
   string poolstr;
-  cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+  cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
   int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
   if (pool < 0) {
     ss << "unrecognized pool '" << poolstr << "'";
     return -ENOENT;
   }
   string var;
-  cmd_getval(g_ceph_context, cmdmap, "var", var);
+  cmd_getval_throws(g_ceph_context, cmdmap, "var", var);
 
   pg_pool_t p = *osdmap.get_pg_pool(pool);
   if (pending_inc.new_pools.count(pool))
@@ -6436,17 +6464,12 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
   int64_t n = 0;
   double f = 0;
   int64_t uf = 0;  // micro-f
-  if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
-    // wasn't a string; maybe an older mon forwarded json with an int?
-    if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
-      return -EINVAL;  // no value!
-  } else {
-    // we got a string.  see if it contains an int.
-    n = strict_strtoll(val.c_str(), 10, &interr);
-    // or a float
-    f = strict_strtod(val.c_str(), &floaterr);
-    uf = llrintl(f * (double)1000000.0);
-  }
+  cmd_getval(g_ceph_context, cmdmap, "val", val);
+
+  // parse string as both int and float; different fields use different types.
+  n = strict_strtoll(val.c_str(), 10, &interr);
+  f = strict_strtod(val.c_str(), &floaterr);
+  uf = llrintl(f * (double)1000000.0);
 
   if (!p.is_tier() &&
       (var == "hit_set_type" || var == "hit_set_period" ||
@@ -6555,7 +6578,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       return r;
     }
     string force;
-    cmd_getval(g_ceph_context,cmdmap, "force", force);
+    cmd_getval_throws(g_ceph_context,cmdmap, "force", force);
     if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
        force != "--yes-i-really-mean-it") {
       ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
@@ -6622,7 +6645,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
   } else if (var == "hashpspool") {
     uint64_t flag = pg_pool_t::get_flag_by_name(var);
     string force;
-    cmd_getval(g_ceph_context, cmdmap, "force", force);
+    cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
     if (force != "--yes-i-really-mean-it") {
       ss << "are you SURE?  this will remap all placement groups in this pool,"
            " this triggers large data movement,"
@@ -6918,7 +6941,7 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
                                                  stringstream& ss)
 {
   string pool_name;
-  cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+  cmd_getval_throws(g_ceph_context, cmdmap, "pool", pool_name);
   int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
   if (pool < 0) {
     ss << "unrecognized pool '" << pool_name << "'";
@@ -6931,7 +6954,7 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
   }
 
   string app;
-  cmd_getval(g_ceph_context, cmdmap, "app", app);
+  cmd_getval_throws(g_ceph_context, cmdmap, "app", app);
   bool app_exists = (p.application_metadata.count(app) > 0);
 
   if (boost::algorithm::ends_with(prefix, "enable")) {
@@ -6946,7 +6969,7 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
     }
 
     string force;
-    cmd_getval(g_ceph_context, cmdmap, "force", force);
+    cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
 
     if (!app_exists && !p.application_metadata.empty() &&
         force != "--yes-i-really-mean-it") {
@@ -6974,7 +6997,7 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
 
   } else if (boost::algorithm::ends_with(prefix, "disable")) {
     string force;
-    cmd_getval(g_ceph_context, cmdmap, "force", force);
+    cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
 
     if (force != "--yes-i-really-mean-it") {
       ss << "Are you SURE? Disabling an application within a pool might result "
@@ -7005,7 +7028,7 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
     }
 
     string key;
-    cmd_getval(g_ceph_context, cmdmap, "key", key);
+    cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
 
     if (key.empty()) {
       ss << "key must be provided";
@@ -7027,7 +7050,7 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
     }
 
     string value;
-    cmd_getval(g_ceph_context, cmdmap, "value", value);
+    cmd_getval_throws(g_ceph_context, cmdmap, "value", value);
     if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
       ss << "value '" << value << "' too long; max length "
          << MAX_POOL_APPLICATION_LENGTH;
@@ -7045,7 +7068,7 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
     }
 
     string key;
-    cmd_getval(g_ceph_context, cmdmap, "key", key);
+    cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
     auto it = p.application_metadata[app].find(key);
     if (it == p.application_metadata[app].end()) {
       ss << "application '" << app << "' on pool '" << pool_name
@@ -7360,7 +7383,7 @@ int OSDMonitor::prepare_command_osd_new(
    * If `id` is specified, and the osd has been previously marked
    * as destroyed, then the `id` will be reused.
    */
-  if (!cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
+  if (!cmd_getval_throws(g_ceph_context, cmdmap, "uuid", uuidstr)) {
     ss << "requires the OSD's UUID to be specified.";
     return -EINVAL;
   } else if (!uuid.parse(uuidstr.c_str())) {
@@ -7368,7 +7391,7 @@ int OSDMonitor::prepare_command_osd_new(
     return -EINVAL;
   }
 
-  if (cmd_getval(g_ceph_context, cmdmap, "id", id) &&
+  if (cmd_getval_throws(g_ceph_context, cmdmap, "id", id) &&
       (id < 0)) {
     ss << "invalid OSD id; must be greater or equal than zero.";
     return -EINVAL;
@@ -7614,7 +7637,7 @@ static int parse_reweights(CephContext *cct,
                           map<int32_t, uint32_t>* weights)
 {
   string weights_str;
-  if (!cmd_getval(g_ceph_context, cmdmap, "weights", weights_str)) {
+  if (!cmd_getval_throws(g_ceph_context, cmdmap, "weights", weights_str)) {
     return -EINVAL;
   }
   std::replace(begin(weights_str), end(weights_str), '\'', '"');
@@ -7797,11 +7820,11 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   int err = 0;
 
   string format;
-  cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+  cmd_getval_throws(g_ceph_context, cmdmap, "format", format, string("plain"));
   boost::scoped_ptr<Formatter> f(Formatter::create(format));
 
   string prefix;
-  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+  cmd_getval_throws(g_ceph_context, cmdmap, "prefix", prefix);
 
   int64_t osdid;
   string name;
@@ -7809,7 +7832,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   if (prefix != "osd pg-temp" &&
       prefix != "osd pg-upmap" &&
       prefix != "osd pg-upmap-items") {  // avoid commands with non-int id arg
-    osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
+    osdid_present = cmd_getval_throws(g_ceph_context, cmdmap, "id", osdid);
   }
   if (osdid_present) {
     ostringstream oss;
@@ -7868,7 +7891,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
   
     int64_t prior_version = 0;
-    if (cmd_getval(g_ceph_context, cmdmap, "prior_version", prior_version)) {
+    if (cmd_getval_throws(g_ceph_context, cmdmap, "prior_version", prior_version)) {
       if (prior_version == osdmap.get_crush_version() - 1) {
        // see if we are a resend of the last update.  this is imperfect
        // (multiple racing updaters may not both get reliable success)
@@ -7963,14 +7986,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
 
     string device_class;
-    if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "class", device_class)) {
       err = -EINVAL; // no value!
       goto reply;
     }
 
     bool stop = false;
     vector<string> idvec;
-    cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+    cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
     CrushWrapper newcrush;
     _get_pending_crush(newcrush);
     set<int> updated;
@@ -8046,7 +8069,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
  } else if (prefix == "osd crush rm-device-class") {
     bool stop = false;
     vector<string> idvec;
-    cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+    cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
     CrushWrapper newcrush;
     _get_pending_crush(newcrush);
     set<int> updated;
@@ -8106,11 +8129,11 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
   } else if (prefix == "osd crush class rename") {
     string srcname, dstname;
-    if (!cmd_getval(g_ceph_context, cmdmap, "srcname", srcname)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "srcname", srcname)) {
       err = -EINVAL;
       goto reply;
     }
-    if (!cmd_getval(g_ceph_context, cmdmap, "dstname", dstname)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "dstname", dstname)) {
       err = -EINVAL;
       goto reply;
     }
@@ -8139,8 +8162,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   } else if (prefix == "osd crush add-bucket") {
     // os crush add-bucket <name> <type>
     string name, typestr;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
-    cmd_getval(g_ceph_context, cmdmap, "type", typestr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "type", typestr);
 
     if (!_have_pending_crush() &&
        _get_stable_crush().name_exists(name)) {
@@ -8187,8 +8210,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     goto update;
   } else if (prefix == "osd crush rename-bucket") {
     string srcname, dstname;
-    cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
-    cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
+    cmd_getval_throws(g_ceph_context, cmdmap, "srcname", srcname);
+    cmd_getval_throws(g_ceph_context, cmdmap, "dstname", dstname);
 
     err = crush_rename_bucket(srcname, dstname, &ss);
     if (err == -EALREADY) // equivalent to success for idempotency
@@ -8220,14 +8243,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
        goto reply;
       }
       string poolname, mode;
-      cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+      cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
       pool = osdmap.lookup_pg_pool_name(poolname.c_str());
       if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
       }
-      cmd_getval(g_ceph_context, cmdmap, "mode", mode);
+      cmd_getval_throws(g_ceph_context, cmdmap, "mode", mode);
       if (mode != "flat" && mode != "positional") {
        ss << "unrecognized weight-set mode '" << mode << "'";
        err = -EINVAL;
@@ -8250,7 +8273,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     int64_t pool;
     if (prefix == "osd crush weight-set rm") {
       string poolname;
-      cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
+      cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
       pool = osdmap.lookup_pg_pool_name(poolname.c_str());
       if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
@@ -8269,9 +8292,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
             prefix == "osd crush weight-set reweight-compat") {
     string poolname, item;
     vector<double> weight;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolname);
-    cmd_getval(g_ceph_context, cmdmap, "item", item);
-    cmd_getval(g_ceph_context, cmdmap, "weight", weight);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolname);
+    cmd_getval_throws(g_ceph_context, cmdmap, "item", item);
+    cmd_getval_throws(g_ceph_context, cmdmap, "weight", weight);
     CrushWrapper newcrush;
     _get_pending_crush(newcrush);
     int64_t pool;
@@ -8333,7 +8356,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
 
     double weight;
-    if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", weight)) {
       ss << "unable to parse weight value '"
          << cmd_vartype_stringify(cmdmap["weight"]) << "'";
       err = -EINVAL;
@@ -8342,7 +8365,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     string args;
     vector<string> argvec;
-    cmd_getval(g_ceph_context, cmdmap, "args", argvec);
+    cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
     map<string,string> loc;
     CrushWrapper::parse_loc_map(argvec, &loc);
 
@@ -8401,7 +8424,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       }
 
       double weight;
-      if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
+      if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", weight)) {
         ss << "unable to parse weight value '"
            << cmd_vartype_stringify(cmdmap["weight"]) << "'";
         err = -EINVAL;
@@ -8410,7 +8433,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
       string args;
       vector<string> argvec;
-      cmd_getval(g_ceph_context, cmdmap, "args", argvec);
+      cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
       map<string,string> loc;
       CrushWrapper::parse_loc_map(argvec, &loc);
 
@@ -8444,8 +8467,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
       string args;
       vector<string> argvec;
-      cmd_getval(g_ceph_context, cmdmap, "name", name);
-      cmd_getval(g_ceph_context, cmdmap, "args", argvec);
+      cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
+      cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
       map<string,string> loc;
       CrushWrapper::parse_loc_map(argvec, &loc);
 
@@ -8482,9 +8505,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     } while (false);
   } else if (prefix == "osd crush swap-bucket") {
     string source, dest, force;
-    cmd_getval(g_ceph_context, cmdmap, "source", source);
-    cmd_getval(g_ceph_context, cmdmap, "dest", dest);
-    cmd_getval(g_ceph_context, cmdmap, "force", force);
+    cmd_getval_throws(g_ceph_context, cmdmap, "source", source);
+    cmd_getval_throws(g_ceph_context, cmdmap, "dest", dest);
+    cmd_getval_throws(g_ceph_context, cmdmap, "force", force);
     CrushWrapper newcrush;
     _get_pending_crush(newcrush);
     if (!newcrush.name_exists(source)) {
@@ -8530,9 +8553,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   } else if (prefix == "osd crush link") {
     // osd crush link <name> <loc1> [<loc2> ...]
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
     vector<string> argvec;
-    cmd_getval(g_ceph_context, cmdmap, "args", argvec);
+    cmd_getval_throws(g_ceph_context, cmdmap, "args", argvec);
     map<string,string> loc;
     CrushWrapper::parse_loc_map(argvec, &loc);
 
@@ -8593,7 +8616,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       _get_pending_crush(newcrush);
 
       string name;
-      cmd_getval(g_ceph_context, cmdmap, "name", name);
+      cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
 
       if (!osdmap.crush->name_exists(name)) {
        err = 0;
@@ -8613,7 +8636,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
       bool unlink_only = prefix == "osd crush unlink";
       string ancestor_str;
-      if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
+      if (cmd_getval_throws(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
        if (!newcrush.name_exists(ancestor_str)) {
          err = -ENOENT;
          ss << "ancestor item '" << ancestor_str
@@ -8660,7 +8683,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     _get_pending_crush(newcrush);
 
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
     if (!newcrush.name_exists(name)) {
       err = -ENOENT;
       ss << "device '" << name << "' does not appear in the crush map";
@@ -8674,7 +8697,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     double w;
-    if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
       ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap["weight"]) << "'";
       err = -EINVAL;
@@ -8698,7 +8721,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     _get_pending_crush(newcrush);
 
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
     if (!newcrush.name_exists(name)) {
       err = -ENOENT;
       ss << "device '" << name << "' does not appear in the crush map";
@@ -8712,7 +8735,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     double w;
-    if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
       ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap["weight"]) << "'";
       err = -EINVAL;
@@ -8736,7 +8759,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     err = 0;
     string profile;
-    cmd_getval(g_ceph_context, cmdmap, "profile", profile);
+    cmd_getval_throws(g_ceph_context, cmdmap, "profile", profile);
     if (profile == "legacy" || profile == "argonaut") {
       newcrush.set_tunables_legacy();
     } else if (profile == "bobtail") {
@@ -8775,10 +8798,10 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     err = 0;
     string tunable;
-    cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
+    cmd_getval_throws(g_ceph_context, cmdmap, "tunable", tunable);
 
     int64_t value = -1;
-    if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "value", value)) {
       err = -EINVAL;
       ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
       goto reply;
@@ -8812,10 +8835,10 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
   } else if (prefix == "osd crush rule create-simple") {
     string name, root, type, mode;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
-    cmd_getval(g_ceph_context, cmdmap, "root", root);
-    cmd_getval(g_ceph_context, cmdmap, "type", type);
-    cmd_getval(g_ceph_context, cmdmap, "mode", mode);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "root", root);
+    cmd_getval_throws(g_ceph_context, cmdmap, "type", type);
+    cmd_getval_throws(g_ceph_context, cmdmap, "mode", mode);
     if (mode == "")
       mode = "firstn";
 
@@ -8853,10 +8876,10 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
   } else if (prefix == "osd crush rule create-replicated") {
     string name, root, type, device_class;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
-    cmd_getval(g_ceph_context, cmdmap, "root", root);
-    cmd_getval(g_ceph_context, cmdmap, "type", type);
-    cmd_getval(g_ceph_context, cmdmap, "class", device_class);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "root", root);
+    cmd_getval_throws(g_ceph_context, cmdmap, "type", type);
+    cmd_getval_throws(g_ceph_context, cmdmap, "class", device_class);
 
     if (!device_class.empty()) {
       if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
@@ -8902,7 +8925,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
   } else if (prefix == "osd erasure-code-profile rm") {
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
 
     if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
       goto wait;
@@ -8933,9 +8956,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
   } else if (prefix == "osd erasure-code-profile set") {
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
     vector<string> profile;
-    cmd_getval(g_ceph_context, cmdmap, "profile", profile);
+    cmd_getval_throws(g_ceph_context, cmdmap, "profile", profile);
     bool force;
     if (profile.size() > 0 && profile.back() == "--force") {
       profile.pop_back();
@@ -9015,9 +9038,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     if (err)
       goto reply;
     string name, poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
     string profile;
-    cmd_getval(g_ceph_context, cmdmap, "profile", profile);
+    cmd_getval_throws(g_ceph_context, cmdmap, "profile", profile);
     if (profile == "")
       profile = "default";
     if (profile == "default") {
@@ -9071,7 +9094,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
   } else if (prefix == "osd crush rule rm") {
     string name;
-    cmd_getval(g_ceph_context, cmdmap, "name", name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "name", name);
 
     if (!osdmap.crush->rule_exists(name)) {
       ss << "rule " << name << " does not exist";
@@ -9115,8 +9138,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   } else if (prefix == "osd crush rule rename") {
     string srcname;
     string dstname;
-    cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
-    cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
+    cmd_getval_throws(g_ceph_context, cmdmap, "srcname", srcname);
+    cmd_getval_throws(g_ceph_context, cmdmap, "dstname", dstname);
     if (srcname.empty() || dstname.empty()) {
       ss << "must specify both source rule name and destination rule name";
       err = -EINVAL;
@@ -9153,7 +9176,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
   } else if (prefix == "osd setmaxosd") {
     int64_t newmax;
-    if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "newmax", newmax)) {
       ss << "unable to parse 'newmax' value '"
          << cmd_vartype_stringify(cmdmap["newmax"]) << "'";
       err = -EINVAL;
@@ -9201,7 +9224,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     double n;
-    if (!cmd_getval(g_ceph_context, cmdmap, "ratio", n)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "ratio", n)) {
       ss << "unable to parse 'ratio' value '"
          << cmd_vartype_stringify(cmdmap["ratio"]) << "'";
       err = -EINVAL;
@@ -9226,7 +9249,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     string v;
-    cmd_getval(g_ceph_context, cmdmap, "version", v);
+    cmd_getval_throws(g_ceph_context, cmdmap, "version", v);
     int vno = ceph_release_from_name(v.c_str());
     if (vno <= 0) {
       ss << "version " << v << " is not recognized";
@@ -9247,7 +9270,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     string sure;
-    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+    cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
     if (sure != "--yes-i-really-mean-it") {
       FeatureMap m;
       mon->get_combined_feature_map(&m);
@@ -9299,9 +9322,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
   } else if (prefix == "osd set") {
     string sure;
-    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+    cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
     string key;
-    cmd_getval(g_ceph_context, cmdmap, "key", key);
+    cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
     if (key == "full")
       return prepare_set_flag(op, CEPH_OSDMAP_FULL);
     else if (key == "pause")
@@ -9356,6 +9379,24 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
        err = -EPERM;
        goto reply;
       }
+    } else if (key == "pglog_hardlimit") {
+      if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
+        ss << "Not advisable to continue since no OSDs are up. Pass "
+           << "--yes-i-really-mean-it if you really wish to continue.";
+        err = -EPERM;
+        goto reply;
+      }
+      // The release check here is required because for OSD_PGLOG_HARDLIMIT,
+      // we are reusing a jewel feature bit that was retired in luminous.
+      if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
+         (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
+          || sure == "--yes-i-really-mean-it")) {
+       return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
+      } else {
+       ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
+       err = -EPERM;
+       goto reply;
+      }
     } else if (key == "require_jewel_osds") {
       if (!osdmap.get_num_up_osds() && sure != "--yes-i-really-mean-it") {
         ss << "Not advisable to continue since no OSDs are up. Pass "
@@ -9410,7 +9451,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
   } else if (prefix == "osd unset") {
     string key;
-    cmd_getval(g_ceph_context, cmdmap, "key", key);
+    cmd_getval_throws(g_ceph_context, cmdmap, "key", key);
     if (key == "full")
       return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
     else if (key == "pause")
@@ -9442,9 +9483,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
   } else if (prefix == "osd require-osd-release") {
     string release;
-    cmd_getval(g_ceph_context, cmdmap, "release", release);
+    cmd_getval_throws(g_ceph_context, cmdmap, "release", release);
     string sure;
-    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+    cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
     if (!osdmap.test_flag(CEPH_OSDMAP_SORTBITWISE)) {
       ss << "the sortbitwise flag must be set first";
       err = -EPERM;
@@ -9503,7 +9544,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     bool verbose = true;
 
     vector<string> idvec;
-    cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+    cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
     for (unsigned j = 0; j < idvec.size() && !stop; j++) {
       set<int> osds;
 
@@ -9637,7 +9678,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     bool stop = false;
 
     vector<string> idvec;
-    cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+    cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
     for (unsigned j = 0; j < idvec.size() && !stop; j++) {
 
       set<int> osds;
@@ -9771,7 +9812,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     bool stop = false;
 
     vector<string> idvec;
-    cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
+    cmd_getval_throws(g_ceph_context, cmdmap, "ids", idvec);
 
     for (unsigned j = 0; j < idvec.size() && !stop; j++) {
 
@@ -9915,7 +9956,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
   } else if (prefix == "osd pg-temp") {
     string pgidstr;
-    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr)) {
       ss << "unable to parse 'pgid' value '"
          << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
       err = -EINVAL;
@@ -9940,7 +9981,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     vector<int64_t> id_vec;
     vector<int32_t> new_pg_temp;
-    if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id_vec)) {
       ss << "unable to parse 'id' value(s) '"
          << cmd_vartype_stringify(cmdmap["id"]) << "'";
       err = -EINVAL;
@@ -9977,7 +10018,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     goto update;
   } else if (prefix == "osd primary-temp") {
     string pgidstr;
-    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr)) {
       ss << "unable to parse 'pgid' value '"
          << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
       err = -EINVAL;
@@ -9996,7 +10037,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
 
     int64_t osd;
-    if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", osd)) {
       ss << "unable to parse 'id' value '"
          << cmd_vartype_stringify(cmdmap["id"]) << "'";
       err = -EINVAL;
@@ -10049,7 +10090,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     if (err < 0)
       goto reply;
     string pgidstr;
-    if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr)) {
       ss << "unable to parse 'pgid' value '"
          << cmd_vartype_stringify(cmdmap["pgid"]) << "'";
       err = -EINVAL;
@@ -10124,7 +10165,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     case OP_PG_UPMAP:
       {
         vector<int64_t> id_vec;
-        if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
+        if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id_vec)) {
           ss << "unable to parse 'id' value(s) '"
              << cmd_vartype_stringify(cmdmap["id"]) << "'";
           err = -EINVAL;
@@ -10184,7 +10225,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     case OP_PG_UPMAP_ITEMS:
       {
         vector<int64_t> id_vec;
-        if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
+        if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id_vec)) {
           ss << "unable to parse 'id' value(s) '"
              << cmd_vartype_stringify(cmdmap["id"]) << "'";
           err = -EINVAL;
@@ -10266,14 +10307,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     goto update;
   } else if (prefix == "osd primary-affinity") {
     int64_t id;
-    if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
       ss << "invalid osd id value '"
          << cmd_vartype_stringify(cmdmap["id"]) << "'";
       err = -EINVAL;
       goto reply;
     }
     double w;
-    if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
       ss << "unable to parse 'weight' value '"
            << cmd_vartype_stringify(cmdmap["weight"]) << "'";
       err = -EINVAL;
@@ -10316,14 +10357,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
   } else if (prefix == "osd reweight") {
     int64_t id;
-    if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
       ss << "unable to parse osd id value '"
          << cmd_vartype_stringify(cmdmap["id"]) << "'";
       err = -EINVAL;
       goto reply;
     }
     double w;
-    if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "weight", w)) {
       ss << "unable to parse weight value '"
          << cmd_vartype_stringify(cmdmap["weight"]) << "'";
       err = -EINVAL;
@@ -10362,14 +10403,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     return true;
   } else if (prefix == "osd lost") {
     int64_t id;
-    if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
       ss << "unable to parse osd id value '"
          << cmd_vartype_stringify(cmdmap["id"]) << "'";
       err = -EINVAL;
       goto reply;
     }
     string sure;
-    if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
       ss << "are you SURE?  this might mean real, permanent data loss.  pass "
            "--yes-i-really-mean-it if you really do.";
       err = -EPERM;
@@ -10414,7 +10455,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
 
     int64_t id;
-    if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "id", id)) {
       ss << "unable to parse osd id value '"
          << cmd_vartype_stringify(cmdmap["id"]) << "";
       err = -EINVAL;
@@ -10427,7 +10468,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
 
     string sure;
-    if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) ||
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure) ||
         sure != "--yes-i-really-mean-it") {
       ss << "Are you SURE? This will mean real, permanent data loss, as well "
          << "as cephx and lockbox keys. Pass --yes-i-really-mean-it if you "
@@ -10533,7 +10574,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     // optional id provided?
     int64_t id = -1, cmd_id = -1;
-    if (cmd_getval(g_ceph_context, cmdmap, "id", cmd_id)) {
+    if (cmd_getval_throws(g_ceph_context, cmdmap, "id", cmd_id)) {
       if (cmd_id < 0) {
        ss << "invalid osd id value '" << cmd_id << "'";
        err = -EINVAL;
@@ -10544,7 +10585,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     uuid_d uuid;
     string uuidstr;
-    if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
+    if (cmd_getval_throws(g_ceph_context, cmdmap, "uuid", uuidstr)) {
       if (!uuid.parse(uuidstr.c_str())) {
         ss << "invalid uuid value '" << uuidstr << "'";
         err = -EINVAL;
@@ -10611,7 +10652,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     return true;
   } else if (prefix == "osd blacklist") {
     string addrstr;
-    cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "addr", addrstr);
     entity_addr_t addr;
     if (!addr.parse(addrstr.c_str(), 0)) {
       ss << "unable to parse address " << addrstr;
@@ -10620,12 +10661,12 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
     else {
       string blacklistop;
-      cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
+      cmd_getval_throws(g_ceph_context, cmdmap, "blacklistop", blacklistop);
       if (blacklistop == "add") {
        utime_t expires = ceph_clock_now();
        double d;
        // default one hour
-       cmd_getval(g_ceph_context, cmdmap, "expire", d,
+       cmd_getval_throws(g_ceph_context, cmdmap, "expire", d,
           g_conf->mon_osd_blacklist_default_expire);
        expires += d;
 
@@ -10665,7 +10706,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
   } else if (prefix == "osd pool mksnap") {
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
     if (pool < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -10673,7 +10714,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     string snapname;
-    cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
+    cmd_getval_throws(g_ceph_context, cmdmap, "snap", snapname);
     const pg_pool_t *p = osdmap.get_pg_pool(pool);
     if (p->is_unmanaged_snaps_mode()) {
       ss << "pool " << poolstr << " is in unmanaged snaps mode";
@@ -10708,7 +10749,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     return true;
   } else if (prefix == "osd pool rmsnap") {
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
     if (pool < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -10716,7 +10757,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     string snapname;
-    cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
+    cmd_getval_throws(g_ceph_context, cmdmap, "snap", snapname);
     const pg_pool_t *p = osdmap.get_pg_pool(pool);
     if (p->is_unmanaged_snaps_mode()) {
       ss << "pool " << poolstr << " is in unmanaged snaps mode";
@@ -10749,16 +10790,16 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   } else if (prefix == "osd pool create") {
     int64_t  pg_num;
     int64_t pgp_num;
-    cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
-    cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
+    cmd_getval_throws(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
 
     string pool_type_str;
-    cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool_type", pool_type_str);
     if (pool_type_str.empty())
       pool_type_str = g_conf->osd_pool_default_type;
 
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
     if (pool_id >= 0) {
       const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
@@ -10793,9 +10834,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     bool implicit_rule_creation = false;
     int64_t expected_num_objects = 0;
     string rule_name;
-    cmd_getval(g_ceph_context, cmdmap, "rule", rule_name);
+    cmd_getval_throws(g_ceph_context, cmdmap, "rule", rule_name);
     string erasure_code_profile;
-    cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
+    cmd_getval_throws(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
 
     if (pool_type == pg_pool_t::TYPE_ERASURE) {
       if (erasure_code_profile == "")
@@ -10829,7 +10870,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
          rule_name = poolstr;
        }
       }
-      cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
+      cmd_getval_throws(g_ceph_context, cmdmap, "expected_num_objects",
                  expected_num_objects, int64_t(0));
     } else {
       //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
@@ -10846,7 +10887,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
         }
         rule_name = erasure_code_profile;
       } else { // cmd is well-formed
-        cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
+        cmd_getval_throws(g_ceph_context, cmdmap, "expected_num_objects",
                    expected_num_objects, int64_t(0));
       }
     }
@@ -10888,7 +10929,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
 
     int64_t fast_read_param;
-    cmd_getval(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
+    cmd_getval_throws(g_ceph_context, cmdmap, "fast_read", fast_read_param, int64_t(-1));
     FastReadType fast_read = FAST_READ_DEFAULT;
     if (fast_read_param == 0)
       fast_read = FAST_READ_OFF;
@@ -10929,9 +10970,9 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
              prefix == "osd pool rm") {
     // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
     string poolstr, poolstr2, sure;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
-    cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
-    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool2", poolstr2);
+    cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
     int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
     if (pool < 0) {
       ss << "pool '" << poolstr << "' does not exist";
@@ -10958,8 +10999,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     goto update;
   } else if (prefix == "osd pool rename") {
     string srcpoolstr, destpoolstr;
-    cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
-    cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "destpool", destpoolstr);
     int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
     int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
 
@@ -11016,7 +11057,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     if (err)
       goto reply;
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
     if (pool_id < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -11024,7 +11065,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     string tierpoolstr;
-    cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
     int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
     if (tierpool_id < 0) {
       ss << "unrecognized pool '" << tierpoolstr << "'";
@@ -11042,7 +11083,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     // make sure new tier is empty
     string force_nonempty;
-    cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
+    cmd_getval_throws(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
     const pool_stat_t *pstats = mon->pgservice->get_pool_stat(tierpool_id);
     if (pstats && pstats->stats.sum.num_objects != 0 &&
        force_nonempty != "--force-nonempty") {
@@ -11080,7 +11121,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   } else if (prefix == "osd tier remove" ||
              prefix == "osd tier rm") {
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
     if (pool_id < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -11088,7 +11129,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     string tierpoolstr;
-    cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
     int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
     if (tierpool_id < 0) {
       ss << "unrecognized pool '" << tierpoolstr << "'";
@@ -11144,7 +11185,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     if (err)
       goto reply;
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
     if (pool_id < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -11152,7 +11193,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     string overlaypoolstr;
-    cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
     int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
     if (overlaypool_id < 0) {
       ss << "unrecognized pool '" << overlaypoolstr << "'";
@@ -11197,7 +11238,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   } else if (prefix == "osd tier remove-overlay" ||
              prefix == "osd tier rm-overlay") {
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
     if (pool_id < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -11242,7 +11283,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     if (err)
       goto reply;
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
     if (pool_id < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -11257,7 +11298,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     string modestr;
-    cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "mode", modestr);
     pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
     if (mode < 0) {
       ss << "'" << modestr << "' is not a valid cache mode";
@@ -11266,7 +11307,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
 
     string sure;
-    cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+    cmd_getval_throws(g_ceph_context, cmdmap, "sure", sure);
     if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
         mode != pg_pool_t::CACHEMODE_NONE &&
         mode != pg_pool_t::CACHEMODE_PROXY &&
@@ -11392,7 +11433,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     if (err)
       goto reply;
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
     if (pool_id < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -11400,7 +11441,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       goto reply;
     }
     string tierpoolstr;
-    cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
     int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
     if (tierpool_id < 0) {
       ss << "unrecognized pool '" << tierpoolstr << "'";
@@ -11417,7 +11458,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
 
     int64_t size = 0;
-    if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
+    if (!cmd_getval_throws(g_ceph_context, cmdmap, "size", size)) {
       ss << "unable to parse 'size' value '"
          << cmd_vartype_stringify(cmdmap["size"]) << "'";
       err = -EINVAL;
@@ -11482,7 +11523,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     return true;
   } else if (prefix == "osd pool set-quota") {
     string poolstr;
-    cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pool", poolstr);
     int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
     if (pool_id < 0) {
       ss << "unrecognized pool '" << poolstr << "'";
@@ -11491,7 +11532,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     }
 
     string field;
-    cmd_getval(g_ceph_context, cmdmap, "field", field);
+    cmd_getval_throws(g_ceph_context, cmdmap, "field", field);
     if (field != "max_objects" && field != "max_bytes") {
       ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
       err = -EINVAL;
@@ -11500,7 +11541,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
 
     // val could contain unit designations, so we treat as a string
     string val;
-    cmd_getval(g_ceph_context, cmdmap, "val", val);
+    cmd_getval_throws(g_ceph_context, cmdmap, "val", val);
     string tss;
     int64_t value;
     if (field == "max_objects") {
@@ -11615,7 +11656,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
   } else if (prefix == "osd force-create-pg") {
     pg_t pgid;
     string pgidstr;
-    cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
+    cmd_getval_throws(g_ceph_context, cmdmap, "pgid", pgidstr);
     if (!pgid.parse(pgidstr.c_str())) {
       ss << "invalid pgid '" << pgidstr << "'";
       err = -EINVAL;
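
Illustrative sketch, not part of this commit: the bulk of the OSDMonitor.cc hunks above simply swap cmd_getval() for cmd_getval_throws(), but prepare_command_pool_set() also changes behaviour: the "val" argument is now always fetched as a string and parsed as both an integer and a float, and the per-field code later decides which parse to trust via interr/floaterr. The self-contained sketch below shows that parse-both pattern with the standard strtoll/strtod standing in for Ceph's strict_strtoll/strict_strtod (an assumption made only so the sketch compiles on its own; the names n, f, uf, interr and floaterr mirror the diff).

#include <cstdint>
#include <cstdlib>
#include <cerrno>
#include <cmath>
#include <iostream>
#include <string>

// Parse-both sketch: interr/floaterr are non-empty when the respective parse
// fails, mirroring how strict_strtoll/strict_strtod report errors in the diff.
struct ParsedVal {
  int64_t n = 0;      // integer interpretation
  double  f = 0;      // float interpretation
  int64_t uf = 0;     // micro-units of f, as used for ratio-style fields
  std::string interr, floaterr;
};

ParsedVal parse_val(const std::string& val)
{
  ParsedVal p;
  char* end = nullptr;
  errno = 0;
  p.n = std::strtoll(val.c_str(), &end, 10);
  if (errno || end == val.c_str() || *end != '\0')
    p.interr = "not an integer";
  errno = 0;
  p.f = std::strtod(val.c_str(), &end);
  if (errno || end == val.c_str() || *end != '\0')
    p.floaterr = "not a float";
  p.uf = std::llrintl(p.f * 1000000.0);
  return p;
}

int main()
{
  ParsedVal p = parse_val("0.85");
  std::cout << "int ok: " << p.interr.empty()
            << " float ok: " << p.floaterr.empty()
            << " uf: " << p.uf << std::endl;   // prints: int ok: 0 float ok: 1 uf: 850000
}
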
index cf9e1d60392a3af0578a552462654f41f2ce5cb1..cf3063aa88b80d716414a1a503ddca6b1515df12 100644 (file)
@@ -299,6 +299,30 @@ int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
   return 0;
 }
 
+// returns true if specified device is attached
+bool BlueFS::is_device(unsigned id)
+{
+  return !(id >= MAX_BDEV || bdev[id] == nullptr);
+}
+
+// returns true if specified device is under full bluefs control
+// and hence can be expanded
+bool BlueFS::is_device_expandable(unsigned id)
+{
+  if (id >= MAX_BDEV || bdev[id] == nullptr) {
+    return false;
+  }
+  switch(id) {
+  case BDEV_WAL:
+    return true;
+
+  case BDEV_DB:
+    // true if DB volume is non-shared
+    return bdev[BDEV_SLOW] != nullptr;
+  }
+  return false;
+}
+
 int BlueFS::mkfs(uuid_d osd_uuid)
 {
   std::unique_lock<std::mutex> l(lock);
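
Illustrative sketch, not part of this commit: the two BlueFS helpers added above encode a small decision table. A device slot can be expanded only if it is attached and fully owned by BlueFS, which always holds for the WAL slot and holds for the DB slot only when a separate slow/main device exists (i.e. the DB volume is not shared). The standalone restatement below assumes the usual BlueFS slot layout (WAL = 0, DB = 1, SLOW = 2) and uses a plain bool array in place of the bdev pointers.

#include <array>
#include <cassert>

// Assumed slot layout, mirroring BlueFS::BDEV_WAL / BDEV_DB / BDEV_SLOW.
constexpr unsigned BDEV_WAL = 0, BDEV_DB = 1, BDEV_SLOW = 2, MAX_BDEV = 3;

// attached[i] stands in for "bdev[i] != nullptr" in the real class.
bool is_device_expandable(const std::array<bool, MAX_BDEV>& attached, unsigned id)
{
  if (id >= MAX_BDEV || !attached[id])
    return false;                       // not attached at all
  switch (id) {
  case BDEV_WAL:
    return true;                        // WAL is always fully bluefs-owned
  case BDEV_DB:
    return attached[BDEV_SLOW];         // DB expandable only when not shared with main
  }
  return false;                         // slow/main device is not expanded via bluefs
}

int main()
{
  // Single shared device registered in the DB slot: not expandable.
  assert(!is_device_expandable({false, true, false}, BDEV_DB));
  // Separate WAL + DB + slow devices: both WAL and DB are expandable.
  assert(is_device_expandable({true, true, true}, BDEV_WAL));
  assert(is_device_expandable({true, true, true}, BDEV_DB));
}
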
index ade3c0498e0ec50f3a3b9daab110c8d388eb9ede..033a4930c99777d3083680eb743a83fd73713038 100644 (file)
@@ -347,6 +347,13 @@ public:
   /// get current extents that we own for given block device
   int get_block_extents(unsigned id, interval_set<uint64_t> *extents);
 
+  // returns true if specified device is attached
+  bool is_device(unsigned id);
+  
+  // returns true if specified device is under full bluefs control
+  // and hence can be expanded
+  bool is_device_expandable(unsigned id);
+
   int open_for_write(
     const string& dir,
     const string& file,
index b326d39bcd2ecaa9e3bf15ca3667abb5a0b5812c..6d601aaac83ab33de5ab040b49c1724e326c1457 100644 (file)
@@ -1370,10 +1370,8 @@ void BlueStore::BufferSpace::read(
   cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
 }
 
-void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq)
+void BlueStore::BufferSpace::_finish_write(Cache* cache, uint64_t seq)
 {
-  std::lock_guard<std::recursive_mutex> l(cache->lock);
-
   auto i = writing.begin();
   while (i != writing.end()) {
     if (i->seq > seq) {
@@ -1648,6 +1646,23 @@ void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
   }
 }
 
+void BlueStore::SharedBlob::finish_write(uint64_t seq)
+{
+  while (true) {
+    Cache *cache = coll->cache;
+    std::lock_guard<std::recursive_mutex> l(cache->lock);
+    if (coll->cache != cache) {
+      ldout(coll->store->cct, 20) << __func__
+                                 << " raced with sb cache update, was " << cache
+                                 << ", now " << coll->cache << ", retrying"
+                                 << dendl;
+      continue;
+    }
+    bc._finish_write(cache, seq);
+    break;
+  }
+}
+
 // SharedBlobSet
 
 #undef dout_prefix
@@ -3418,8 +3433,12 @@ void BlueStore::MempoolThread::_tune_cache_size(bool interval_stats)
   uint64_t target = store->osd_memory_target;
   uint64_t base = store->osd_memory_base;
   double fragmentation = store->osd_memory_expected_fragmentation;
-  uint64_t cache_max = ((1.0 - fragmentation) * target) - base;
   uint64_t cache_min = store->osd_memory_cache_min;
+  uint64_t cache_max = cache_min;
+  uint64_t limited_target = (1.0 - fragmentation) * target;
+  if (limited_target > base + cache_min) {
+    cache_max = limited_target - base;
+  }
 
   size_t heap_size = 0;
   size_t unmapped = 0;
@@ -4171,6 +4190,8 @@ void BlueStore::_init_logger()
                    "collection");
   b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
                     "Read EIO errors propagated to high level callers");
+  b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
+                    "Read operations that required at least one retry due to failed checksum validation");
   logger = b.create_perf_counters();
   cct->get_perfcounters_collection()->add(logger);
 }
@@ -5059,6 +5080,18 @@ int BlueStore::_reconcile_bluefs_freespace()
   return 0;
 }
 
+void BlueStore::_dump_alloc_on_rebalance_failure()
+{
+  auto dump_interval =
+    cct->_conf->bluestore_bluefs_balance_failure_dump_interval;
+  if (dump_interval > 0 &&
+    next_dump_on_bluefs_balance_failure <= ceph_clock_now()) {
+    alloc->dump();
+    next_dump_on_bluefs_balance_failure = ceph_clock_now();
+    next_dump_on_bluefs_balance_failure += dump_interval;
+  }
+}
+
 int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
 {
   int ret = 0;
@@ -5147,18 +5180,18 @@ int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
                                        0, 0, &exts);
 
     if (alloc_len <= 0) {
-      dout(1) << __func__ << " no allocate on 0x" << std::hex << gift
+      dout(0) << __func__ << " no allocate on 0x" << std::hex << gift
               << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
       alloc->unreserve(gift);
-      alloc->dump();
+      _dump_alloc_on_rebalance_failure();
       return 0;
     } else if (alloc_len < (int64_t)gift) {
-      dout(1) << __func__ << " insufficient allocate on 0x" << std::hex << gift
+      dout(0) << __func__ << " insufficient allocate on 0x" << std::hex << gift
               << " min_alloc_size 0x" << min_alloc_size 
              << " allocated 0x" << alloc_len
              << std::dec << dendl;
       alloc->unreserve(gift - alloc_len);
-      alloc->dump();
+      _dump_alloc_on_rebalance_failure();
     }
     for (auto& p : exts) {
       bluestore_pextent_t e = bluestore_pextent_t(p);
@@ -6698,7 +6731,8 @@ int BlueStore::_do_read(
   uint64_t offset,
   size_t length,
   bufferlist& bl,
-  uint32_t op_flags)
+  uint32_t op_flags,
+  uint64_t retry_count)
 {
   FUNCTRACE();
   int r = 0;
@@ -6919,7 +6953,14 @@ int BlueStore::_do_read(
       bufferlist& compressed_bl = *p++;
       if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
                       b2r_it->second.front().logical_offset) < 0) {
-       return -EIO;
+        // Handles spurious read errors caused by a kernel bug.
+        // We sometimes get all-zero pages as a result of the read under
+        // high memory pressure. Retrying the failing read succeeds in most cases.
+        // See also: http://tracker.ceph.com/issues/22464
+        if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+          return -EIO;
+        }
+        return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
       }
       bufferlist raw_bl;
       r = _decompress(compressed_bl, &raw_bl);
@@ -6937,7 +6978,14 @@ int BlueStore::_do_read(
       for (auto& reg : b2r_it->second) {
        if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
                         reg.logical_offset) < 0) {
-         return -EIO;
+          // Handles spurious read errors caused by a kernel bug.
+          // We sometimes get all-zero pages as a result of the read under
+          // high memory pressure. Retrying the failing read succeeds in most cases.
+          // See also: http://tracker.ceph.com/issues/22464
+          if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+            return -EIO;
+          }
+          return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
        }
        if (buffered) {
          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
@@ -6981,6 +7029,11 @@ int BlueStore::_do_read(
   assert(pos == length);
   assert(pr == pr_end);
   r = bl.length();
+  if (retry_count) {
+    logger->inc(l_bluestore_reads_with_retries);
+    dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
+            << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
+  }
   return r;
 }
 
@@ -6993,6 +7046,13 @@ int BlueStore::_verify_csum(OnodeRef& o,
   uint64_t bad_csum;
   utime_t start = ceph_clock_now();
   int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
+  if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
+      (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
+    derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
+    bad = blob_xoffset;
+    r = -1;
+    bad_csum = 0xDEADBEEF;
+  }
   if (r < 0) {
     if (r == -1) {
       PExtentVector pex;
@@ -8398,7 +8458,7 @@ void BlueStore::_txc_finish(TransContext *txc)
   assert(txc->state == TransContext::STATE_FINISHING);
 
   for (auto& sb : txc->shared_blobs_written) {
-    sb->bc.finish_write(sb->get_cache(), txc->seq);
+    sb->finish_write(txc->seq);
   }
   txc->shared_blobs_written.clear();
 
@@ -10608,6 +10668,8 @@ void BlueStore::_choose_write_options(
 
   dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
            << " target_blob_size 0x" << std::hex << wctx->target_blob_size
+          << " compress=" << (int)wctx->compress
+          << " buffered=" << (int)wctx->buffered
            << std::dec << dendl;
 }
 
@@ -11551,6 +11613,11 @@ int BlueStore::_rename(TransContext *txc,
   c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
   r = 0;
 
+  // hold a ref to new Onode in old name position, to ensure we don't drop
+  // it from the cache before this txc commits (or else someone may come along
+  // and read newo's metadata via the old name).
+  txc->note_modified_object(oldo);
+
  out:
   dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
           << new_oid << " = " << r << dendl;
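
Illustrative sketch, not part of this commit: the _tune_cache_size hunk above fixes an unsigned-arithmetic hazard in the cache autotuner. With a small osd_memory_target, the old expression ((1.0 - fragmentation) * target) - base could go negative and convert to a bogus huge value, so the new code starts from osd_memory_cache_min and only raises cache_max when the fragmentation-limited target actually exceeds base + cache_min. A minimal standalone version of just that clamp (variable names follow the diff; the byte values in main() are made-up examples):

#include <cstdint>
#include <iostream>

// Clamped autotune bound, as introduced in BlueStore::MempoolThread::_tune_cache_size.
uint64_t compute_cache_max(uint64_t target, uint64_t base,
                           double fragmentation, uint64_t cache_min)
{
  uint64_t cache_max = cache_min;                        // never below the floor
  uint64_t limited_target = (1.0 - fragmentation) * target;
  if (limited_target > base + cache_min)
    cache_max = limited_target - base;                   // no unsigned wrap-around
  return cache_max;
}

int main()
{
  // 4 GiB target, 768 MiB base, 15% expected fragmentation, 128 MiB floor
  std::cout << compute_cache_max(4ull << 30, 768ull << 20, 0.15, 128ull << 20) << "\n";
  // tiny target: the old formula would have misbehaved; the clamp returns the floor
  std::cout << compute_cache_max(512ull << 20, 768ull << 20, 0.15, 128ull << 20) << "\n";
}
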
index 6a14042df107d6f4c419eb609457fb1f9883b7ad..8f78594d98b9e1a095ca84279e8b90a146fd27d9 100644 (file)
@@ -118,6 +118,7 @@ enum {
   l_bluestore_extent_compress,
   l_bluestore_gc_merged,
   l_bluestore_read_eio,
+  l_bluestore_reads_with_retries,
   l_bluestore_last
 };
 
@@ -333,7 +334,7 @@ public:
       b->cache_private = _discard(cache, offset, bl.length());
       _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
     }
-    void finish_write(Cache* cache, uint64_t seq);
+    void _finish_write(Cache* cache, uint64_t seq);
     void did_read(Cache* cache, uint32_t offset, bufferlist& bl) {
       std::lock_guard<std::recursive_mutex> l(cache->lock);
       Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
@@ -410,6 +411,8 @@ public:
     void put_ref(uint64_t offset, uint32_t length,
                 PExtentVector *r, set<SharedBlob*> *maybe_unshared_blobs);
 
+    void finish_write(uint64_t seq);
+
     friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
       return l.get_sbid() == r.get_sbid();
     }
@@ -1825,6 +1828,7 @@ private:
   unsigned bluefs_shared_bdev = 0;  ///< which bluefs bdev we are sharing
   bool bluefs_single_shared_device = true;
   utime_t bluefs_last_balance;
+  utime_t next_dump_on_bluefs_balance_failure;
 
   KeyValueDB *db = nullptr;
   BlockDevice *bdev = nullptr;
@@ -2124,6 +2128,7 @@ private:
 
   void _open_statfs();
 
+  void _dump_alloc_on_rebalance_failure();
   int _reconcile_bluefs_freespace();
   int _balance_bluefs_freespace(PExtentVector *extents);
   void _commit_bluefs_freespace(const PExtentVector& extents);
@@ -2366,7 +2371,8 @@ public:
     uint64_t offset,
     size_t len,
     bufferlist& bl,
-    uint32_t op_flags = 0);
+    uint32_t op_flags = 0,
+    uint64_t retry_count = 0);
 
 private:
   int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
index 1320df831b83f5682e4bc155452a220e41b5f41e..746ba8222ba90045fd642638af1771eff2352a09 100644 (file)
@@ -66,6 +66,28 @@ void validate_path(CephContext *cct, const string& path, bool bluefs)
   }
 }
 
+const char* find_device_path(
+  int id,
+  CephContext *cct,
+  const vector<string>& devs)
+{
+  for (auto& i : devs) {
+    bluestore_bdev_label_t label;
+    int r = BlueStore::_read_bdev_label(cct, i, &label);
+    if (r < 0) {
+      cerr << "unable to read label for " << i << ": "
+          << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if ((id == BlueFS::BDEV_SLOW && label.description == "main") ||
+        (id == BlueFS::BDEV_DB && label.description == "bluefs db") ||
+        (id == BlueFS::BDEV_WAL && label.description == "bluefs wal")) {
+      return i.c_str();
+    }
+  }
+  return nullptr;
+}
+
 BlueFS *open_bluefs(
   CephContext *cct,
   const string& path,
@@ -277,7 +299,9 @@ int main(int argc, char **argv)
   env_to_vec(args);
 
   auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
-                        CODE_ENVIRONMENT_UTILITY, 0);
+                        CODE_ENVIRONMENT_UTILITY,
+                        CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
   common_init_finish(cct.get());
 
   if (action == "fsck" ||
@@ -403,7 +427,22 @@ int main(int argc, char **argv)
           << cpp_strerror(r) << std::endl;
       exit(EXIT_FAILURE);
     }
-    label.meta[key] = value;
+    if (key == "size") {
+      label.size = strtoull(value.c_str(), nullptr, 10);
+    } else if (key == "osd_uuid") {
+      label.osd_uuid.parse(value.c_str());
+    } else if (key == "btime") {
+      uint64_t epoch;
+      uint64_t nsec;
+      int r = utime_t::parse_date(value.c_str(), &epoch, &nsec);
+      if (r == 0) {
+       label.btime = utime_t(epoch, nsec);
+      }
+    } else if (key == "description") {
+      label.description = value;
+    } else {
+      label.meta[key] = value;
+    }
     r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
     if (r < 0) {
       cerr << "unable to write label for " << devs.front() << ": "
@@ -438,17 +477,51 @@ int main(int argc, char **argv)
   }
   else if (action == "bluefs-bdev-expand") {
     BlueFS *fs = open_bluefs(cct.get(), path, devs);
-    cout << "start:" << std::endl;
     fs->dump_block_extents(cout);
+    cout << "Expanding..." << std::endl;
     for (int devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB }) {
+      if (!fs->is_device(devid)) {
+        continue;
+      }
+      if (!fs->is_device_expandable(devid)) {
+       cout << devid
+            << " : can't be expanded. Bypassing..."
+            << std::endl;
+       continue;
+      }
       interval_set<uint64_t> before;
       fs->get_block_extents(devid, &before);
+      assert(!before.empty());
       uint64_t end = before.range_end();
       uint64_t size = fs->get_block_device_size(devid);
       if (end < size) {
-       cout << "expanding dev " << devid << " from 0x" << std::hex
+       cout << devid
+            << " : expanding from 0x" << std::hex
             << end << " to 0x" << size << std::dec << std::endl;
        fs->add_block_extent(devid, end, size-end);
+       const char* path = find_device_path(devid, cct.get(), devs);
+       if (path == nullptr) {
+         cerr << devid
+              <<": can't find device path " << std::endl;
+         continue;
+       }
+       bluestore_bdev_label_t label;
+       int r = BlueStore::_read_bdev_label(cct.get(), path, &label);
+       if (r < 0) {
+         cerr << "unable to read label for " << path << ": "
+               << cpp_strerror(r) << std::endl;
+         continue;
+       }
+        label.size = size;
+       r = BlueStore::_write_bdev_label(cct.get(), path, label);
+       if (r < 0) {
+         cerr << "unable to write label for " << path << ": "
+               << cpp_strerror(r) << std::endl;
+         continue;
+       }
+       cout << devid
+            <<" : size label updated to " << size
+            << std::endl;
       }
     }
     delete fs;
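After growing BlueFS onto the newly available space, the bluefs-bdev-expand path above also rewrites the device's on-disk label so its size field stays consistent with the enlarged device. A compressed sketch of that read-modify-write; Label, read_label and write_label are hypothetical stand-ins for bluestore_bdev_label_t and BlueStore::_read_bdev_label/_write_bdev_label:

    #include <cstdint>
    #include <string>

    struct Label {                       // stand-in for bluestore_bdev_label_t
      uint64_t size = 0;
      std::string description;
    };

    // Assumed helpers; the real code calls BlueStore::_read_bdev_label() and
    // BlueStore::_write_bdev_label() against the block device path.
    int read_label(const std::string& dev, Label* out);
    int write_label(const std::string& dev, const Label& l);

    int record_new_size(const std::string& dev, uint64_t new_size)
    {
      Label label;
      int r = read_label(dev, &label);
      if (r < 0)
        return r;                        // leave the label untouched on read failure
      label.size = new_size;             // only the size field is rewritten
      return write_label(dev, label);
    }

As in the hunk above, a label read or write failure is reported but does not undo the expansion itself.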
index bbab0d097be0e1705aab5dd61c5c58a6be21d3a9..2f2d2e0f8feab27a9972d2a94cd292aba44009b6 100644 (file)
@@ -63,7 +63,7 @@
       out:                                     \
       complete_inject_failure();               \
       return r;                                        \
-    } catch (RetryException) {                 \
+    } catch (RetryException&) {                        \
       failed = true;                           \
     } catch (...) {                            \
       ceph_abort();                            \
index 7d737433784c88616f8bc0dd1c5972bb5b3a8bf8..0f968d33364547d4e7729e8409849ec27a36a2d2 100644 (file)
@@ -215,7 +215,7 @@ OSDService::OSDService(OSD *osd) :
   monc(osd->monc),
   peering_wq(osd->peering_wq),
   recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
-                 &osd->disk_tp),
+                 &osd->recovery_tp),
   class_handler(osd->class_handler),
   pg_epoch_lock("OSDService::pg_epoch_lock"),
   publish_lock("OSDService::publish_lock"),
@@ -1971,7 +1971,8 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
             "osd_peering_tp_threads"),
   osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
            get_num_op_threads()),
-  disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
+  remove_tp(cct, "OSD::remove_tp", "tp_osd_remove", cct->_conf->osd_remove_threads, "osd_remove_threads"),
+  recovery_tp(cct, "OSD::recovery_tp", "tp_osd_recovery", cct->_conf->osd_recovery_threads, "osd_recovery_threads"),
   command_tp(cct, "OSD::command_tp", "tp_osd_cmd",  1),
   session_waiting_lock("OSD::session_waiting_lock"),
   osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
@@ -2022,7 +2023,7 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
     store,
     cct->_conf->osd_remove_thread_timeout,
     cct->_conf->osd_remove_thread_suicide_timeout,
-    &disk_tp),
+    &remove_tp),
   service(this)
 {
   monc->set_messenger(client_messenger);
@@ -2679,7 +2680,8 @@ int OSD::init()
   service.max_oldest_map = superblock.oldest_map;
 
   osd_op_tp.start();
-  disk_tp.start();
+  remove_tp.start();
+  recovery_tp.start();
   command_tp.start();
 
   set_disk_tp_priority();
@@ -3434,9 +3436,13 @@ int OSD::shutdown()
   command_tp.stop();
   dout(10) << "command tp stopped" << dendl;
 
-  disk_tp.drain();
-  disk_tp.stop();
-  dout(10) << "disk tp paused (new)" << dendl;
+  remove_tp.drain();
+  remove_tp.stop();
+  dout(10) << "remove tp paused (new)" << dendl;
+
+  recovery_tp.drain();
+  recovery_tp.stop();
+  dout(10) << "recovery tp paused (new)" << dendl;
 
   dout(10) << "stopping agent" << dendl;
   service.agent_stop();
@@ -3501,7 +3507,10 @@ int OSD::shutdown()
 #ifdef PG_DEBUG_REFS
   service.dump_live_pgids();
 #endif
+
+  osd_lock.Unlock();
   cct->_conf->remove_observer(this);
+  osd_lock.Lock();
 
   dout(10) << "syncing store" << dendl;
   enable_disable_fuse(true);
@@ -5255,11 +5264,10 @@ void OSD::heartbeat()
 
 bool OSD::heartbeat_reset(Connection *con)
 {
+  Mutex::Locker l(heartbeat_lock);
   HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
   if (s) {
-    heartbeat_lock.Lock();
     if (is_stopping()) {
-      heartbeat_lock.Unlock();
       s->put();
       return true;
     }
@@ -5293,7 +5301,6 @@ bool OSD::heartbeat_reset(Connection *con)
     } else {
       dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
     }
-    heartbeat_lock.Unlock();
     s->put();
   }
   return true;
@@ -5431,6 +5438,7 @@ void OSD::tick_without_osd_lock()
     }
   }
 
+  check_ops_in_flight();
   mgrc.update_osd_health(get_health_metrics());
   service.kick_recovery_queue();
   tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
@@ -5707,6 +5715,12 @@ bool remove_dir(
       cont = dstate->pause_clearing();
       handle.suspend_tp_timeout();
       waiter.wait();
+      if (cct->_conf->osd_delete_sleep) {
+        utime_t t;
+        t.set_from_double(cct->_conf->osd_delete_sleep);
+        lgeneric_subdout(cct, osd, 10) << __func__ << " inject delay of " << t << dendl;
+        t.sleep();
+      }
       handle.reset_tp_timeout();
       if (cont)
         cont = dstate->resume_clearing();
@@ -6585,6 +6599,22 @@ COMMAND("compact",
         "osd", "rw", "cli,rest")
 };
 
+namespace {
+  class unlock_guard {
+    Mutex& m;
+  public:
+    explicit unlock_guard(Mutex& mutex)
+      : m(mutex)
+    {
+      m.Unlock();
+    }
+    unlock_guard(unlock_guard&) = delete;
+    ~unlock_guard() {
+      m.Lock();
+    }
+  };
+}
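unlock_guard is the inverse of a lock_guard: it drops an already-held mutex for the current scope and re-takes it on exit. do_command uses it so that injectargs and config set run without osd_lock held, which matters now that handle_conf_change (a few hunks below) takes osd_lock itself. A self-contained illustration with std::mutex; the OSD Mutex type uses Lock()/Unlock() instead, but the shape is the same:

    #include <mutex>

    class scoped_unlock {                       // same idea as unlock_guard above
      std::mutex& m;
    public:
      explicit scoped_unlock(std::mutex& mutex) : m(mutex) { m.unlock(); }
      scoped_unlock(const scoped_unlock&) = delete;
      ~scoped_unlock() { m.lock(); }            // re-acquire on scope exit
    };

    void do_command_like(std::mutex& big_lock)
    {
      std::lock_guard<std::mutex> l(big_lock);  // caller-style: lock held on entry
      {
        scoped_unlock unlock{big_lock};         // drop it across the blocking call
        // ... apply config changes that may themselves take big_lock ...
      }                                         // big_lock re-taken here
      // continue with big_lock held
    }

The RAII form guarantees the lock is re-acquired on every return path, unlike the manual Unlock()/Lock() pairs it replaces.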
+
 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
 {
   int r = 0;
@@ -6658,21 +6688,19 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
     string args = argsvec.front();
     for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
       args += " " + *a;
-    osd_lock.Unlock();
+    unlock_guard unlock{osd_lock};
     r = cct->_conf->injectargs(args, &ss);
-    osd_lock.Lock();
   }
   else if (prefix == "config set") {
     std::string key;
     std::string val;
     cmd_getval(cct, cmdmap, "key", key);
     cmd_getval(cct, cmdmap, "value", val);
-    osd_lock.Unlock();
+    unlock_guard unlock{osd_lock};
     r = cct->_conf->set_val(key, val, true, &ss);
     if (r == 0) {
       cct->_conf->apply_changes(nullptr);
     }
-    osd_lock.Lock();
   }
   else if (prefix == "cluster_log") {
     vector<string> msg;
@@ -6966,6 +6994,7 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
     cmd_getval(cct, cmdmap, "delay", delay);
     ostringstream oss;
     oss << delay;
+    unlock_guard unlock{osd_lock};
     r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
     if (r != 0) {
       ss << "kick_recovery_wq: error setting "
@@ -9969,6 +9998,7 @@ const char** OSD::get_tracked_conf_keys() const
 void OSD::handle_conf_change(const struct md_config_t *conf,
                             const std::set <std::string> &changed)
 {
+  Mutex::Locker l(osd_lock);
   if (changed.count("osd_max_backfills")) {
     service.local_reserver.set_max(cct->_conf->osd_max_backfills);
     service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
@@ -10099,12 +10129,14 @@ void OSD::set_disk_tp_priority()
     return;
   int cls =
     ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
-  if (cls < 0)
+  if (cls < 0) {
     derr << __func__ << cpp_strerror(cls) << ": "
         << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
         << " but only the following values are allowed: idle, be or rt" << dendl;
-  else
-    disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
+  } else {
+    remove_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
+    recovery_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
+  }
 }
 
 // --------------------------------
index b6d2482bcedb663acb986821a008c2c066ed8ba2..f445a2b989a07fecb6f1897f9cceed674ae84782 100644 (file)
@@ -1393,7 +1393,8 @@ private:
 
   ThreadPool peering_tp;
   ShardedThreadPool osd_op_tp;
-  ThreadPool disk_tp;
+  ThreadPool remove_tp;
+  ThreadPool recovery_tp;
   ThreadPool command_tp;
 
   void set_disk_tp_priority();
index f6cf25fd9b27f1e0b6f48b5f89dfe98fb610ba7e..3b7ddfa4a40fab9f6ffd0320ca0a5817d7abcb97 100644 (file)
@@ -1637,12 +1637,12 @@ void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
     to_check.insert(p.first);
   }
   for (auto& pg : to_check) {
-    auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
-    if (crush_rule < 0) {
-      lderr(cct) << __func__ << " unable to load crush-rule of pg "
-                 << pg << dendl;
+    if (!tmpmap.pg_exists(pg)) {
+      ldout(cct, 0) << __func__ << " pg " << pg << " is gone" << dendl;
+      to_cancel.insert(pg);
       continue;
     }
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
     map<int, float> weight_map;
     auto it = rule_weight_map.find(crush_rule);
     if (it == rule_weight_map.end()) {
@@ -1672,19 +1672,23 @@ void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
     tmpmap.pg_to_raw_up(pg, &raw, &primary);
     set<int> parents;
     for (auto osd : raw) {
+      // skip non-existent/down osd for erasure-coded PGs
+      if (osd == CRUSH_ITEM_NONE)
+        continue;
       if (type > 0) {
         auto parent = tmpmap.crush->get_parent_of_type(osd, type, crush_rule);
-        if (parent >= 0) {
+        if (parent < 0) {
+          auto r = parents.insert(parent);
+          if (!r.second) {
+            // two up-set osds come from same parent
+            to_cancel.insert(pg);
+            break;
+          }
+        } else {
           lderr(cct) << __func__ << " unable to get parent of raw osd."
                      << osd << " of pg " << pg
                      << dendl;
-          break;
-        }
-        auto r = parents.insert(parent);
-        if (!r.second) {
-          // two up-set osds come from same parent
-          to_cancel.insert(pg);
-          break;
+          // continue to do checks below
         }
       }
       // the above check validates collision only
@@ -1740,6 +1744,7 @@ void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
       }
     }
   }
+  tmpmap.clean_pg_upmaps(cct, pending_inc);
 }
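The hunk above skips CRUSH_ITEM_NONE holes in erasure-coded up sets and cancels a pg_upmap when two mapped OSDs resolve to the same failure-domain parent; CRUSH bucket ids are negative, so a negative parent is the success case, and a failed lookup is now only logged. A stripped-down sketch of that collision test; get_parent_of is a hypothetical stand-in for crush->get_parent_of_type():

    #include <functional>
    #include <set>
    #include <vector>

    constexpr int CRUSH_ITEM_NONE_ = 0x7fffffff;   // mirrors CRUSH_ITEM_NONE in crush/crush.h

    // True if two up-set OSDs share the same failure-domain parent bucket.
    bool up_set_collides(const std::vector<int>& up,
                         const std::function<int(int)>& get_parent_of)
    {
      std::set<int> parents;
      for (int osd : up) {
        if (osd == CRUSH_ITEM_NONE_)             // hole in an EC up set, nothing to check
          continue;
        int parent = get_parent_of(osd);
        if (parent >= 0)                         // lookup failed; the real code just logs
          continue;
        if (!parents.insert(parent).second)
          return true;                           // same parent seen twice, cancel the upmap
      }
      return false;
    }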
 
 int OSDMap::apply_incremental(const Incremental &inc)
@@ -3148,6 +3153,8 @@ string OSDMap::get_flag_string(unsigned f)
     s += ",recovery_deletes";
   if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
     s += ",purged_snapdirs";
+  if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
+    s += ",pglog_hardlimit";
   if (s.length())
     s.erase(0, 1);
   return s;
@@ -3905,7 +3912,7 @@ int OSDMap::summarize_mapping_stats(
 
 int OSDMap::clean_pg_upmaps(
   CephContext *cct,
-  Incremental *pending_inc)
+  Incremental *pending_inc) const
 {
   ldout(cct, 10) << __func__ << dendl;
   int changed = 0;
@@ -3926,9 +3933,16 @@ int OSDMap::clean_pg_upmaps(
     pg_to_raw_osds(p.first, &raw, &primary);
     mempool::osdmap::vector<pair<int,int>> newmap;
     for (auto& q : p.second) {
-      if (std::find(raw.begin(), raw.end(), q.first) != raw.end()) {
-       newmap.push_back(q);
+      if (std::find(raw.begin(), raw.end(), q.first) == raw.end()) {
+        // cancel mapping if source osd does not exist anymore
+        continue;
+      }
+      if (q.second != CRUSH_ITEM_NONE && q.second < max_osd &&
+          q.second >= 0 && osd_weight[q.second] == 0) {
+        // cancel mapping if target osd is out
+        continue;
       }
+      newmap.push_back(q);
     }
     if (newmap.empty()) {
       ldout(cct, 10) << " removing no-op pg_upmap_items " << p.first << " "
@@ -4069,6 +4083,8 @@ int OSDMap::calc_pg_upmaps(
     multimap<float,int> deviation_osd;  // deviation(pgs), osd
     set<int> overfull;
     for (auto& i : pgs_by_osd) {
+      // make sure osd is still there (belongs to this crush-tree)
+      assert(osd_weight.count(i.first));
       float target = osd_weight[i.first] * pgs_per_weight;
       float deviation = (float)i.second.size() - target;
       ldout(cct, 20) << " osd." << i.first
@@ -4107,8 +4123,6 @@ int OSDMap::calc_pg_upmaps(
     for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
       int osd = p->second;
       float deviation = p->first;
-      // make sure osd is still there (belongs to this crush-tree)
-      assert(osd_weight.count(osd));
       float target = osd_weight[osd] * pgs_per_weight;
       assert(target > 0);
       if (deviation/target < max_deviation_ratio) {
index 2a5f985c5fc9aa5af57c343bed786389fa093f3d..5e3fe3d28642b9e5e67780161b47c6545548ba10 100644 (file)
@@ -1313,7 +1313,7 @@ public:
 
   int clean_pg_upmaps(
     CephContext *cct,
-    Incremental *pending_inc);
+    Incremental *pending_inc) const;
 
   bool try_pg_upmap(
     CephContext *cct,
@@ -1333,6 +1333,11 @@ public:
 
   int get_osds_by_bucket_name(const string &name, set<int> *osds) const;
 
+  bool have_pg_upmaps(pg_t pg) const {
+    return pg_upmap.count(pg) ||
+      pg_upmap_items.count(pg);
+  }
+
   /*
    * handy helpers to build simple maps...
    */
index d2760c1aedd0adc4e9444fc9395cd19a5b1c8a32..7965757d7bd461a3d110d96c750d4ed3987ca64a 100644 (file)
@@ -3348,33 +3348,6 @@ void PG::write_if_dirty(ObjectStore::Transaction& t)
     t.omap_setkeys(coll, pgmeta_oid, km);
 }
 
-void PG::trim_log()
-{
-  assert(is_primary());
-  calc_trim_to();
-  dout(10) << __func__ << " to " << pg_trim_to << dendl;
-  if (pg_trim_to != eversion_t()) {
-    // inform peers to trim log
-    assert(!actingbackfill.empty());
-    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-        i != actingbackfill.end();
-        ++i) {
-      if (*i == pg_whoami) continue;
-      osd->send_message_osd_cluster(
-       i->osd,
-       new MOSDPGTrim(
-         get_osdmap()->get_epoch(),
-         spg_t(info.pgid.pgid, i->shard),
-         pg_trim_to),
-       get_osdmap()->get_epoch());
-    }
-
-    // trim primary as well
-    pg_log.trim(pg_trim_to, info);
-    dirty_info = true;
-  }
-}
-
 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
 {
   // raise last_complete only if we were previously up to date
@@ -3458,7 +3431,14 @@ void PG::append_log(
        roll_forward_to));
   }
 
-  pg_log.trim(trim_to, info);
+  dout(10) << __func__ << " approx pg log length =  "
+           << pg_log.get_log().approx_size() << dendl;
+  dout(10) << __func__ << " transaction_applied = "
+           << transaction_applied << dendl;
+  if (!transaction_applied)
+    dout(10) << __func__ << " " << pg_whoami
+             << " is backfill target" << dendl;
+  pg_log.trim(trim_to, info, transaction_applied);
 
   // update the local pg, pg log
   dirty_info = true;
@@ -3920,9 +3900,14 @@ void PG::reg_next_scrub()
     return;
 
   utime_t reg_stamp;
-  if (scrubber.must_scrub ||
-      (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
+  bool must = false;
+  if (scrubber.must_scrub) {
+    // Set the smallest time that isn't utime_t()
+    reg_stamp = utime_t(0,1);
+    must = true;
+  } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
     reg_stamp = ceph_clock_now();
+    must = true;
   } else {
     reg_stamp = info.history.last_scrub_stamp;
   }
@@ -3936,7 +3921,7 @@ void PG::reg_next_scrub()
                                               reg_stamp,
                                               scrub_min_interval,
                                               scrub_max_interval,
-                                              scrubber.must_scrub);
+                                              must);
 }
 
 void PG::unreg_next_scrub()
@@ -4367,7 +4352,7 @@ void PG::_scan_snaps(ScrubMap &smap)
          bool done;
          t.register_on_applied_sync(
            new C_SafeCond(&my_lock, &my_cond, &done, &r));
-         r = osd->store->apply_transaction(osr.get(), std::move(t));
+         r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
          if (r != 0) {
            derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
                 << dendl;
@@ -7553,9 +7538,6 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
     pg->publish_stats_to_osd();
   }
 
-  // trim pglog on recovered
-  pg->trim_log();
-
   // adjust acting set?  (e.g. because backfill completed...)
   bool history_les_bound = false;
   if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
index 11bf5fc8d4fd2900a3b20143730672f4ebd51997..d782d897fedffa8c4d73a1483e8969e3d6237886 100644 (file)
@@ -1139,6 +1139,8 @@ public:
 
   virtual void calc_trim_to() = 0;
 
+  virtual void calc_trim_to_aggressive() = 0;
+
   void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
                        pg_missing_t& omissing, pg_shard_t from);
   void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
@@ -2506,6 +2508,10 @@ protected:
     return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
   }
 
+  bool hard_limit_pglog() const {
+    return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
+  }
+
   void init_primary_up_acting(
     const vector<int> &newup,
     const vector<int> &newacting,
@@ -2653,7 +2659,6 @@ public:
     ObjectStore::Transaction &t,
     bool transaction_applied = true);
   bool check_log_for_corruption(ObjectStore *store);
-  void trim_log();
 
   std::string get_corrupt_pg_log_name() const;
   static int read_info(
index 96f49fd9d85011330de2cd036f9397c06b250727..8a6648c33eec5362a3bdddb935657ad84bff2da4 100644 (file)
@@ -50,14 +50,9 @@ void PGLog::IndexedLog::trim(
   set<string>* trimmed_dups,
   eversion_t *write_from_dups)
 {
-  if (complete_to != log.end() &&
-      complete_to->version <= s) {
-    generic_dout(0) << " bad trim to " << s << " when complete_to is "
-                   << complete_to->version
-                   << " on " << *this << dendl;
-  }
-
   assert(s <= can_rollback_to);
+  if (complete_to != log.end())
+    lgeneric_subdout(cct, osd, 20) << " complete_to " << complete_to->version << dendl;
 
   auto earliest_dup_version =
     log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
@@ -68,17 +63,17 @@ void PGLog::IndexedLog::trim(
     const pg_log_entry_t &e = *log.begin();
     if (e.version > s)
       break;
-    generic_dout(20) << "trim " << e << dendl;
+    lgeneric_subdout(cct, osd, 20) << "trim " << e << dendl;
     if (trimmed)
       trimmed->insert(e.version);
 
     unindex(e);         // remove from index,
 
     // add to dup list
-    generic_dout(20) << "earliest_dup_version = " << earliest_dup_version << dendl;
+    lgeneric_subdout(cct, osd, 20) << "earliest_dup_version = " << earliest_dup_version << dendl;
     if (e.version.version >= earliest_dup_version) {
       if (write_from_dups != nullptr && *write_from_dups > e.version) {
-       generic_dout(20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
+       lgeneric_subdout(cct, osd, 20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
        *write_from_dups = e.version;
       }
       dups.push_back(pg_log_dup_t(e));
@@ -91,6 +86,10 @@ void PGLog::IndexedLog::trim(
       }
     }
 
+    bool reset_complete_to = false;
+    // we are trimming past complete_to, so reset complete_to
+    if (complete_to != log.end() && e.version >= complete_to->version)
+      reset_complete_to = true;
     if (rollback_info_trimmed_to_riter == log.rend() ||
        e.version == rollback_info_trimmed_to_riter->version) {
       log.pop_front();
@@ -98,13 +97,20 @@ void PGLog::IndexedLog::trim(
     } else {
       log.pop_front();
     }
+
+    // reset complete_to to the beginning of the log
+    if (reset_complete_to) {
+      lgeneric_subdout(cct, osd, 20) << " moving complete_to to "
+                      << log.begin()->version << dendl;
+      complete_to = log.begin();
+    }
   }
 
   while (!dups.empty()) {
     const auto& e = *dups.begin();
     if (e.version.version >= earliest_dup_version)
       break;
-    generic_dout(20) << "trim dup " << e << dendl;
+    lgeneric_subdout(cct, osd, 20) << "trim dup " << e << dendl;
     if (trimmed_dups)
       trimmed_dups->insert(e.get_key_name());
     if (indexed_data & PGLOG_INDEXED_DUPS) {
@@ -162,16 +168,23 @@ void PGLog::clear_info_log(
 
 void PGLog::trim(
   eversion_t trim_to,
-  pg_info_t &info)
+  pg_info_t &info,
+  bool transaction_applied)
 {
+  dout(10) << __func__ << " proposed trim_to = " << trim_to << dendl;
   // trim?
   if (trim_to > log.tail) {
-    // We shouldn't be trimming the log past last_complete
-    assert(trim_to <= info.last_complete);
+    dout(10) << __func__ << " missing = " << missing.num_missing() << dendl;
+    // Don't assert for backfill_targets
+    // or whenever there are missing items
+    if (transaction_applied && (missing.num_missing() == 0))
+      assert(trim_to <= info.last_complete);
 
     dout(10) << "trim " << log << " to " << trim_to << dendl;
     log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups);
     info.log_tail = log.tail;
+    if (log.complete_to != log.log.end())
+      dout(10) << " after trim complete_to " << log.complete_to->version << dendl;
   }
 }
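Two things change in the trim path above: the hard assert on last_complete is skipped for backfill targets and for PGs that still have missing objects, and the complete_to cursor is re-seated on the new log front whenever trimming reaches or passes the entry it refers to. A generic sketch of that cursor re-seating on a std::list, since popping the pointed-to element would otherwise leave a dangling iterator:

    #include <list>

    // Pop the front of an ascending log while keeping a saved cursor valid:
    // if the popped entry is at or past the cursor, re-seat the cursor on the
    // new front (the old iterator would dangle). Precondition: !log.empty().
    void pop_front_keeping_cursor(std::list<int>& log, std::list<int>::iterator& cursor)
    {
      bool reseat = (cursor != log.end() && log.front() >= *cursor);
      log.pop_front();
      if (reseat)
        cursor = log.begin();            // may equal end() if the log is now empty
    }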
 
index 7253936ddcfa6f718258647d9590fed3222e92c7..6f85ee1f1a5505cc6585c0d1b55332f810cabbea 100644 (file)
@@ -705,7 +705,8 @@ public:
 
   void trim(
     eversion_t trim_to,
-    pg_info_t &info);
+    pg_info_t &info,
+    bool transaction_applied = true);
 
   void roll_forward_to(
     eversion_t roll_forward_to,
index a4a4f8ffb1dbe864ab37af155eb93ff70e86b62e..0074c7964bf62ad3752a7a4baea5340f1e5b54be 100644 (file)
@@ -1569,23 +1569,23 @@ void PrimaryLogPG::calc_trim_to()
   size_t target = cct->_conf->osd_min_pg_log_entries;
   if (is_degraded() ||
       state_test(PG_STATE_RECOVERING |
-                PG_STATE_RECOVERY_WAIT |
-                PG_STATE_BACKFILLING |
-                PG_STATE_BACKFILL_WAIT |
-                PG_STATE_BACKFILL_TOOFULL)) {
+                 PG_STATE_RECOVERY_WAIT |
+                 PG_STATE_BACKFILLING |
+                 PG_STATE_BACKFILL_WAIT |
+                 PG_STATE_BACKFILL_TOOFULL)) {
     target = cct->_conf->osd_max_pg_log_entries;
   }
 
-  eversion_t limit = MIN(
+  eversion_t limit = std::min(
     min_last_complete_ondisk,
     pg_log.get_can_rollback_to());
   if (limit != eversion_t() &&
       limit != pg_trim_to &&
       pg_log.get_log().approx_size() > target) {
-    size_t num_to_trim = MIN(pg_log.get_log().approx_size() - target,
-                            cct->_conf->osd_pg_log_trim_max);
+    size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
+                             cct->_conf->osd_pg_log_trim_max);
     if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
-       cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
+        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
       return;
     }
     list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
@@ -1594,9 +1594,9 @@ void PrimaryLogPG::calc_trim_to()
       new_trim_to = it->version;
       ++it;
       if (new_trim_to > limit) {
-       new_trim_to = limit;
-       dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
-       break;
+        new_trim_to = limit;
+        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
+        break;
       }
     }
     dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
@@ -1606,6 +1606,63 @@ void PrimaryLogPG::calc_trim_to()
   }
 }
 
+void PrimaryLogPG::calc_trim_to_aggressive()
+{
+  size_t target = cct->_conf->osd_min_pg_log_entries;
+  if (is_degraded() ||
+      state_test(PG_STATE_RECOVERING |
+                PG_STATE_RECOVERY_WAIT |
+                PG_STATE_BACKFILLING |
+                PG_STATE_BACKFILL_WAIT |
+                PG_STATE_BACKFILL_TOOFULL)) {
+    target = cct->_conf->osd_max_pg_log_entries;
+  }
+  // limit pg log trimming up to the can_rollback_to value
+  eversion_t limit = std::min(
+    pg_log.get_head(),
+    pg_log.get_can_rollback_to());
+  dout(10) << __func__ << " limit = " << limit << dendl;
+
+  if (limit != eversion_t() &&
+      limit != pg_trim_to &&
+      pg_log.get_log().approx_size() > target) {
+    dout(10) << __func__ << " approx pg log length =  "
+             << pg_log.get_log().approx_size() << dendl;
+    size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
+                                 cct->_conf->osd_pg_log_trim_max);
+    dout(10) << __func__ << " num_to_trim =  " << num_to_trim << dendl;
+    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
+       cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
+      return;
+    }
+    auto it = pg_log.get_log().log.begin(); // oldest log entry
+    auto rit = pg_log.get_log().log.rbegin();
+    eversion_t by_n_to_keep; // start from tail
+    eversion_t by_n_to_trim = eversion_t::max(); // start from head
+    for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
+      i++;
+      if (i > target && by_n_to_keep == eversion_t()) {
+        by_n_to_keep = rit->version;
+      }
+      if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
+        by_n_to_trim = it->version;
+      }
+      if (by_n_to_keep != eversion_t() &&
+          by_n_to_trim != eversion_t::max()) {
+        break;
+      }
+    }
+
+    if (by_n_to_keep == eversion_t()) {
+      return;
+    }
+
+    pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
+    dout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
+    assert(pg_trim_to <= pg_log.get_head());
+  }
+}
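calc_trim_to_aggressive() walks the log from both ends at once: from the newest entry to find the version that still keeps `target` entries, and from the oldest to find the version reached after `num_to_trim` entries, then trims to the smaller of the two, bounded by `limit`. A simplified rendering of that selection over plain integer versions, assuming versions are ordered oldest to newest:

    #include <algorithm>
    #include <cstdint>
    #include <deque>

    // log is ordered oldest..newest; returns 0 when there is nothing to trim.
    uint64_t pick_trim_to(const std::deque<uint64_t>& log,
                          size_t target, size_t num_to_trim, uint64_t limit)
    {
      if (log.size() <= target || num_to_trim == 0)
        return 0;
      uint64_t by_n_to_keep = log[log.size() - target - 1];               // newest entry we may drop
      uint64_t by_n_to_trim = log[std::min(num_to_trim, log.size()) - 1]; // end of the forced-trim span
      return std::min({by_n_to_keep, by_n_to_trim, limit});
    }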
+
 PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
   PG(o, curmap, _pool, p),
@@ -3318,7 +3375,10 @@ void PrimaryLogPG::execute_ctx(OpContext *ctx)
   assert(op->may_write() || op->may_cache());
 
   // trim log?
-  calc_trim_to();
+  if (hard_limit_pglog())
+    calc_trim_to_aggressive();
+  else
+    calc_trim_to();
 
   // verify that we are doing this in order?
   if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
@@ -5804,7 +5864,6 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
         t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
                           op.alloc_hint.expected_write_size,
                          op.alloc_hint.flags);
-        ctx->delta_stats.num_wr++;
         result = 0;
       }
       break;
@@ -9579,7 +9638,10 @@ void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
   dout(20) << __func__ << " " << repop << dendl;
   issue_repop(repop, ctx.get());
   eval_repop(repop);
-  calc_trim_to();
+  if (hard_limit_pglog())
+    calc_trim_to_aggressive();
+  else
+    calc_trim_to();
   repop->put();
 }
 
@@ -9721,7 +9783,10 @@ void PrimaryLogPG::submit_log_entries(
       assert(r == 0);
     });
 
-  calc_trim_to();
+  if (hard_limit_pglog())
+    calc_trim_to_aggressive();
+  else
+    calc_trim_to();
 }
 
 void PrimaryLogPG::cancel_log_updates()
index 3f10cef187526c3ad2c0457989a6daf8f12a82fa..ca1a0b1d6037cf43acd4a68979c4d32400563db4 100644 (file)
@@ -1333,6 +1333,7 @@ protected:
   void apply_and_flush_repops(bool requeue);
 
   void calc_trim_to() override;
+  void calc_trim_to_aggressive() override;
   int do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr);
   int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr);
 
index 58cdfdefbd139ca2c3a95d3073c0cce87518d3ec..5b63b9ed979c0e166e8d9c70dc3c7846bc450ba0 100644 (file)
@@ -417,10 +417,14 @@ struct pg_t {
   unsigned get_split_bits(unsigned pg_num) const;
 
   bool contains(int bits, const ghobject_t& oid) {
-    return oid.match(bits, ps());
+    return
+      (int64_t)m_pool == oid.hobj.get_logical_pool() &&
+      oid.match(bits, ps());
   }
   bool contains(int bits, const hobject_t& oid) {
-    return oid.match(bits, ps());
+    return
+      (int64_t)m_pool == oid.get_logical_pool() &&
+      oid.match(bits, ps());
   }
 
   hobject_t get_hobj_start() const;
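pg_t::contains() now also requires the object's logical pool to match the PG's pool, in addition to the low placement bits of its hash matching ps(), so temp objects from another pool no longer look local. A reduced sketch of that combined test; it assumes hobject_t::match() compares the low `bits` bits of the hash against the seed:

    #include <cstdint>

    // True if an object (pool, hash) can belong to a PG (pool, ps) under `bits` split bits.
    bool pg_contains(int64_t pg_pool, uint32_t pg_ps, unsigned bits,
                     int64_t obj_pool, uint32_t obj_hash)
    {
      uint32_t mask = (bits >= 32) ? 0xffffffffu : ((1u << bits) - 1);
      return pg_pool == obj_pool && (obj_hash & mask) == (pg_ps & mask);
    }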
@@ -562,7 +566,8 @@ struct spg_t {
     return ghobject_t(
       hobject_t(object_t(name), "", CEPH_NOSNAP,
                pgid.ps(),
-               hobject_t::POOL_TEMP_START - pgid.pool(), ""),
+               hobject_t::get_temp_pool(pgid.pool()),
+               ""),
       ghobject_t::NO_GEN,
       shard);
   }
index 8ca0c4b27188eddf8db2856db60ebb1fcb3bade4..0b5663b81fa7daffddc5e4bc5aff1645c9c5d00c 100644 (file)
@@ -532,11 +532,12 @@ void Journaler::_finish_flush(int r, uint64_t start, ceph::real_time stamp)
   // adjust safe_pos
   auto it = pending_safe.find(start);
   assert(it != pending_safe.end());
+  uint64_t min_next_safe_pos = pending_safe.begin()->second;
   pending_safe.erase(it);
   if (pending_safe.empty())
     safe_pos = next_safe_pos;
   else
-    safe_pos = pending_safe.begin()->second;
+    safe_pos = min_next_safe_pos;
 
   ldout(cct, 10) << "_finish_flush safe from " << start
                 << ", pending_safe " << pending_safe
@@ -1267,6 +1268,13 @@ bool Journaler::try_read_entry(bufferlist& bl)
 
   // prefetch?
   _prefetch();
+
+  // If bufferlist consists of discontiguous memory, decoding types whose
+  // denc_traits needs contiguous memory is inefficient. The bufferlist may
+  // get copied to temporary memory multiple times (copy_shallow() in
+  // src/include/denc.h actually does deep copy)
+  if (bl.get_num_buffers() > 1)
+    bl.rebuild();
   return true;
 }
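Both this Journaler change and the ObjectCacher::maybe_rebuild_buffer() hunks further down apply the same idea: if a bufferlist has become fragmented, collapse it into one contiguous allocation before decoders that want contiguous memory touch it, paying one copy now instead of repeated deep copies later. In isolation, written as a template so the sketch stays self-contained without pinning down the exact bufferlist include:

    // Works for ceph::bufferlist, which provides get_num_buffers() and rebuild().
    template <typename BufferList>
    void make_contiguous(BufferList& bl)
    {
      if (bl.get_num_buffers() > 1)   // fragmented across more than one buffer
        bl.rebuild();                 // collapse into a single contiguous allocation
    }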
 
index 26705ce1a15fbbc54d7e37e6255f6711a1e7272a..0f0272d1e6a628fb73233bcf00107597ca31a274 100644 (file)
@@ -12,7 +12,7 @@
 #include "include/assert.h"
 
 #define MAX_FLUSH_UNDER_LOCK 20  ///< max bh's we start writeback on
-#define BUFFER_MEMORY_WEIGHT 12   // memory usage of BufferHead, count in (1<<n)
+#define BUFFER_MEMORY_WEIGHT CEPH_PAGE_SHIFT  // memory usage of BufferHead, count in (1<<n)
 
 using std::chrono::seconds;
                                 /// while holding the lock
@@ -230,6 +230,20 @@ void ObjectCacher::Object::try_merge_bh(BufferHead *bh)
   ++p;
   if (p != data.end() && can_merge_bh(bh, p->second))
     merge_left(bh, p->second);
+
+  maybe_rebuild_buffer(bh);
+}
+
+void ObjectCacher::Object::maybe_rebuild_buffer(BufferHead *bh)
+{
+  auto& bl = bh->bl;
+  if (bl.get_num_buffers() <= 1)
+    return;
+
+  auto wasted = bl.get_wasted_space();
+  if (wasted * 2 > bl.length() &&
+      wasted > (1U << BUFFER_MEMORY_WEIGHT))
+    bl.rebuild();
 }
 
 /*
@@ -454,15 +468,18 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex,
         if (cur + max >= bh->end()) {
           // we want right bit (one splice)
           final = split(bh, cur);   // just split it, take right half.
+          maybe_rebuild_buffer(bh);
           replace_journal_tid(final, tid);
           ++p;
           assert(p->second == final);
         } else {
           // we want middle bit (two splices)
           final = split(bh, cur);
+          maybe_rebuild_buffer(bh);
           ++p;
           assert(p->second == final);
-          split(final, cur+max);
+          auto right = split(final, cur+max);
+          maybe_rebuild_buffer(right);
           replace_journal_tid(final, tid);
         }
       } else {
@@ -471,7 +488,8 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex,
           // whole bufferhead, piece of cake.
         } else {
           // we want left bit (one splice)
-          split(bh, cur + max);        // just split
+          auto right = split(bh, cur + max);        // just split
+          maybe_rebuild_buffer(right);
         }
         if (final) {
           oc->mark_dirty(bh);
@@ -549,6 +567,7 @@ void ObjectCacher::Object::truncate(loff_t s)
     // split bh at truncation point?
     if (bh->start() < s) {
       split(bh, s);
+      maybe_rebuild_buffer(bh);
       continue;
     }
 
@@ -586,13 +605,15 @@ void ObjectCacher::Object::discard(loff_t off, loff_t len,
     // split bh at truncation point?
     if (bh->start() < off) {
       split(bh, off);
+      maybe_rebuild_buffer(bh);
       ++p;
       continue;
     }
 
     assert(bh->start() >= off);
     if (bh->end() > off + len) {
-      split(bh, off + len);
+      auto right = split(bh, off + len);
+      maybe_rebuild_buffer(right);
     }
 
     ++p;
index 60f049ef55d5b2a4bf142b570a33fd728b4e7f0b..81abb214c07379c3fa4eb50c0fd7bfc138a1b893 100644 (file)
@@ -343,6 +343,7 @@ class ObjectCacher {
     void merge_left(BufferHead *left, BufferHead *right);
     bool can_merge_bh(BufferHead *left, BufferHead *right);
     void try_merge_bh(BufferHead *bh);
+    void maybe_rebuild_buffer(BufferHead *bh);
 
     bool is_cached(loff_t off, loff_t len) const;
     bool include_all_cached_data(loff_t off, loff_t len);
index c93370ac249f4670d66c198a463bcb78b3796e7d..0989a4c1494f38a03d1d9a5b1498ea77a08aaf46 100644 (file)
@@ -413,7 +413,9 @@ void Objecter::shutdown()
 
   initialized = false;
 
+  wl.unlock();
   cct->_conf->remove_observer(this);
+  wl.lock();
 
   map<int,OSDSession*>::iterator p;
   while (!osd_sessions.empty()) {
@@ -2873,10 +2875,11 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
   }
 
   bool unpaused = false;
-  if (t->paused && !target_should_be_paused(t)) {
-    t->paused = false;
+  bool should_be_paused = target_should_be_paused(t);
+  if (t->paused && !should_be_paused) {
     unpaused = true;
   }
+  t->paused = should_be_paused;
 
   bool legacy_change =
     t->pgid != pgid ||
@@ -3439,7 +3442,9 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
     op->tid = 0;
     m->get_redirect().combine_with_locator(op->target.target_oloc,
                                           op->target.target_oid.name);
-    op->target.flags |= (CEPH_OSD_FLAG_REDIRECTED | CEPH_OSD_FLAG_IGNORE_OVERLAY);
+    op->target.flags |= (CEPH_OSD_FLAG_REDIRECTED |
+                        CEPH_OSD_FLAG_IGNORE_CACHE |
+                        CEPH_OSD_FLAG_IGNORE_OVERLAY);
     _op_submit(op, sul, NULL);
     m->put();
     return;
index 87cba46d1cf76a36ae486adf9c5fb9910dd7bb17..73406d837076c6244b31e0dd4716b462e83c8c0c 100644 (file)
@@ -571,14 +571,14 @@ class CephFSVolumeClient(object):
         else:
             return pool_id
 
-    def create_group(self, group_id):
+    def create_group(self, group_id, mode=0o755):
         # Prevent craftily-named volume groups from colliding with the meta
         # files.
         if group_id.endswith(META_FILE_EXT):
             raise ValueError("group ID cannot end with '{0}'.".format(
                 META_FILE_EXT))
         path = self._get_group_path(group_id)
-        self._mkdir_p(path)
+        self._mkdir_p(path, mode)
 
     def destroy_group(self, group_id):
         path = self._get_group_path(group_id)
@@ -589,7 +589,7 @@ class CephFSVolumeClient(object):
         else:
             self.fs.rmdir(path)
 
-    def _mkdir_p(self, path):
+    def _mkdir_p(self, path, mode=0o755):
         try:
             self.fs.stat(path)
         except cephfs.ObjectNotFound:
@@ -604,9 +604,10 @@ class CephFSVolumeClient(object):
             try:
                 self.fs.stat(subpath)
             except cephfs.ObjectNotFound:
-                self.fs.mkdir(subpath, 0o755)
+                self.fs.mkdir(subpath, mode)
 
-    def create_volume(self, volume_path, size=None, data_isolated=False, namespace_isolated=True):
+    def create_volume(self, volume_path, size=None, data_isolated=False, namespace_isolated=True,
+                      mode=0o755):
         """
         Set up metadata, pools and auth for a volume.
 
@@ -622,7 +623,7 @@ class CephFSVolumeClient(object):
         path = self._get_path(volume_path)
         log.info("create_volume: {0}".format(path))
 
-        self._mkdir_p(path)
+        self._mkdir_p(path, mode)
 
         if size is not None:
             self.fs.setxattr(path, 'ceph.quota.max_bytes', to_bytes(size), 0)
@@ -1362,9 +1363,9 @@ class CephFSVolumeClient(object):
             dir_path, self.rados.conf_get('client_snapdir'), snapshot_name
         )
 
-    def _snapshot_create(self, dir_path, snapshot_name):
+    def _snapshot_create(self, dir_path, snapshot_name, mode=0o755):
         # TODO: raise intelligible exception for clusters where snaps are disabled
-        self.fs.mkdir(self._snapshot_path(dir_path, snapshot_name), 0o755)
+        self.fs.mkdir(self._snapshot_path(dir_path, snapshot_name), mode)
 
     def _snapshot_destroy(self, dir_path, snapshot_name):
         """
@@ -1375,17 +1376,18 @@ class CephFSVolumeClient(object):
         except cephfs.ObjectNotFound:
             log.warn("Snapshot was already gone: {0}".format(snapshot_name))
 
-    def create_snapshot_volume(self, volume_path, snapshot_name):
-        self._snapshot_create(self._get_path(volume_path), snapshot_name)
+    def create_snapshot_volume(self, volume_path, snapshot_name, mode=0o755):
+        self._snapshot_create(self._get_path(volume_path), snapshot_name, mode)
 
     def destroy_snapshot_volume(self, volume_path, snapshot_name):
         self._snapshot_destroy(self._get_path(volume_path), snapshot_name)
 
-    def create_snapshot_group(self, group_id, snapshot_name):
+    def create_snapshot_group(self, group_id, snapshot_name, mode=0o755):
         if group_id is None:
             raise RuntimeError("Group ID may not be None")
 
-        return self._snapshot_create(self._get_group_path(group_id), snapshot_name)
+        return self._snapshot_create(self._get_group_path(group_id), snapshot_name,
+                                     mode)
 
     def destroy_snapshot_group(self, group_id, snapshot_name):
         if group_id is None:
index 529ee012d6124b9f1333982ad2e8ed1b49fe2423..0f966aa2dc147faa7ab8e0635e2f60103ef7a278 100644 (file)
@@ -258,6 +258,11 @@ class Module(MgrModule):
             "desc": "Show an optimization plan",
             "perm": "r",
         },
+        {
+            "cmd": "balancer ls",
+            "desc": "List all plans",
+            "perm": "r",
+        },
         {
             "cmd": "balancer execute name=plan,type=CephString",
             "desc": "Execute an optimization plan",
@@ -277,7 +282,7 @@ class Module(MgrModule):
         self.log.warn("Handling command: '%s'" % str(command))
         if command['prefix'] == 'balancer status':
             s = {
-                'plans': self.plans.keys(),
+                'plans': list(self.plans.keys()),
                 'active': self.active,
                 'mode': self.get_config('mode', default_mode),
             }
@@ -344,6 +349,8 @@ class Module(MgrModule):
         elif command['prefix'] == 'balancer reset':
             self.plans = {}
             return (0, '', '')
+        elif command['prefix'] == 'balancer ls':
+            return (0, json.dumps([p for p in self.plans], indent=4), '')
         elif command['prefix'] == 'balancer dump':
             plan = self.plans.get(command['plan'])
             if not plan:
@@ -608,12 +615,16 @@ class Module(MgrModule):
         }
         self.log.debug('score_by_root %s' % pe.score_by_root)
 
+        # get the list of score metrics, comma separated
+        metrics = self.get_config('crush_compat_metrics', 'pgs,objects,bytes').split(',')
+
         # total score is just average of normalized stddevs
         pe.score = 0.0
         for r, vs in six.iteritems(pe.score_by_root):
             for k, v in six.iteritems(vs):
-                pe.score += v
-        pe.score /= 3 * len(roots)
+                if k in metrics:
+                    pe.score += v
+        pe.score /= len(metrics) * len(roots)
         return pe
 
     def evaluate(self, ms, pools, verbose=False):
@@ -756,7 +767,12 @@ class Module(MgrModule):
             self.log.error(detail)
             return -errno.EOPNOTSUPP, detail
 
-        key = 'pgs'  # pgs objects or bytes
+        # rebalance by pgs, objects, or bytes
+        metrics = self.get_config('crush_compat_metrics', 'pgs,objects,bytes').split(',')
+        key = metrics[0] # balancing using the first score metric
+        if key not in ['pgs', 'bytes', 'objects']:
+            self.log.warn("Invalid crush_compat balancing key %s. Using 'pgs'." % key)
+            key = 'pgs'
 
         # go
         best_ws = copy.deepcopy(orig_ws)
index 96eb33ce54a6b8fba453e8e6987120ea1ca0699b..b1f369d22f37a8a5010d5d2f6d79d525211efc7f 100644 (file)
@@ -114,6 +114,8 @@ class Module(MgrModule):
         for daemon, counters in six.iteritems(self.get_all_perf_counters()):
             svc_type, svc_id = daemon.split(".", 1)
             metadata = self.get_metadata(svc_type, svc_id)
+            if not metadata:
+                continue
 
             for path, counter_info in counters.items():
                 if counter_info['type'] & self.PERFCOUNTER_HISTOGRAM:
index 03dd9962a7ec01a25a8039cb42d346c5f3796af7..fc013fb8591277394b308368b8c13824d01dbc9f 100644 (file)
@@ -102,7 +102,7 @@ POOL_METADATA = ('pool_id', 'name')
 
 RGW_METADATA = ('ceph_daemon', 'hostname', 'ceph_version')
 
-DISK_OCCUPATION = ( 'ceph_daemon', 'device','instance')
+DISK_OCCUPATION = ('ceph_daemon', 'device', 'db_device', 'wal_device', 'instance')
 
 NUM_OBJECTS = ['degraded', 'misplaced', 'unfound']
 
@@ -483,13 +483,24 @@ class Module(MgrModule):
             osd_metadata = self.get_metadata("osd", str(id_))
             if osd_metadata is None:
                 continue
-            dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
-            osd_dev_node = None
-            for dev_key in dev_keys:
-                val = osd_metadata.get(dev_key, None)
-                if val and val != "unknown":
-                    osd_dev_node = val
-                    break
+
+            osd_objectstore = osd_metadata.get('osd_objectstore', None)
+            if osd_objectstore == "filestore":
+                # collect filestore backend device
+                osd_dev_node = osd_metadata.get('backend_filestore_dev_node', None)
+                # collect filestore journal device
+                osd_wal_dev_node = osd_metadata.get('osd_journal', '')
+                osd_db_dev_node = ''
+            elif osd_objectstore == "bluestore":
+                # collect bluestore backend device
+                osd_dev_node = osd_metadata.get('bluestore_bdev_dev_node', None)
+                # collect bluestore wal backend
+                osd_wal_dev_node = osd_metadata.get('bluefs_wal_dev_node', '')
+                # collect bluestore db backend
+                osd_db_dev_node = osd_metadata.get('bluefs_db_dev_node', '')
+            if osd_dev_node and osd_dev_node == "unknown":
+                osd_dev_node = None
+
             osd_hostname = osd_metadata.get('hostname', None)
             if osd_dev_node and osd_hostname:
                 self.log.debug("Got dev for osd {0}: {1}/{2}".format(
@@ -497,6 +508,8 @@ class Module(MgrModule):
                 self.metrics['disk_occupation'].set(1, (
                     "osd.{0}".format(id_),
                     osd_dev_node,
+                    osd_db_dev_node,
+                    osd_wal_dev_node,
                     osd_hostname
                 ))
             else:
index 83e32fb60a0c6dc48feeb531ef366ca25bb27e1e..847156780aa0ea5ec2b37807f647b081be071fc9 100644 (file)
@@ -6,7 +6,7 @@ OSD_FLAGS = [
 
 # Implemented osd commands
 OSD_IMPLEMENTED_COMMANDS = [
-    'scrub', 'deep_scrub', 'repair'
+    'scrub', 'deep-scrub', 'repair'
 ]
 
 # Valid values for the 'var' argument to 'ceph osd pool set'
index 5832081d19361f6139283e4578f67f0de5df0543..197df6ec51dafd7cc82806f40e07c646deb38301 100644 (file)
@@ -362,7 +362,9 @@ class Module(MgrModule):
             if tag == 'seq':
                 return
 
-            request = [x for x in self.requests if x.is_running(tag)]
+            with self.requests_lock:
+                request = [x for x in self.requests if x.is_running(tag)]
+
             if len(request) != 1:
                 self.log.warn("Unknown request '%s'" % str(tag))
                 return
@@ -581,8 +583,8 @@ class Module(MgrModule):
 
 
     def submit_request(self, _request, **kwargs):
-        request = CommandsRequest(_request)
         with self.requests_lock:
+            request = CommandsRequest(_request)
             self.requests.append(request)
         if kwargs.get('wait', 0):
             while not request.is_finished():
index 12eddf516b350f60c3bf726372059a390d6beff6..acedbff906ab7f2906e2b7b41d61cbd1779e12cb 100644 (file)
@@ -196,7 +196,7 @@ class Module(MgrModule):
                 dns = self.get_latest("mds", daemon_info['name'], "mds_mem.dn")
 
                 activity = "Evts: " + self.format_dimless(
-                    self.get_rate("mds", daemon_info['name'], "mds_log.replay"),
+                    self.get_rate("mds", daemon_info['name'], "mds_log.replayed"),
                     5
                 ) + "/s"
 
@@ -240,7 +240,7 @@ class Module(MgrModule):
         output += "\n" + standby_table.get_string() + "\n"
 
         if len(mds_versions) == 1:
-            output += "MDS version: {0}".format(mds_versions.keys()[0])
+            output += "MDS version: {0}".format(list(mds_versions)[0])
         else:
             version_table = PrettyTable(["version", "daemons"])
             for version, daemons in six.iteritems(mds_versions):
index 4738b3c197415cbdb919a7eca7bbb4747d3a04b0..0013ae27b780516f01e09d18f9c7b4883a3028cf 100644 (file)
@@ -356,6 +356,7 @@ RBD_FEATURES_SINGLE_CLIENT = _RBD_FEATURES_SINGLE_CLIENT
 RBD_FEATURES_ALL = _RBD_FEATURES_ALL
 
 RBD_FLAG_OBJECT_MAP_INVALID = _RBD_FLAG_OBJECT_MAP_INVALID
+RBD_FLAG_FAST_DIFF_INVALID = _RBD_FLAG_FAST_DIFF_INVALID
 
 RBD_MIRROR_MODE_DISABLED = _RBD_MIRROR_MODE_DISABLED
 RBD_MIRROR_MODE_IMAGE = _RBD_MIRROR_MODE_IMAGE
index ff2d2d8a12045deb7b263c3b44c90b0eb95b9904..57cb2a5b94ce90590cdc35efe8ebd3505096fce5 100644 (file)
@@ -142,11 +142,7 @@ add_library(rgw_a STATIC ${rgw_a_srcs})
 
 add_dependencies(rgw_a civetweb_h)
 
-target_include_directories(rgw_a PUBLIC
-  ${FCGI_INCLUDE_DIR}
-  "../rapidjson/include"
-  )
-target_compile_definitions(rgw_a PUBLIC BOOST_COROUTINES_NO_DEPRECATION_WARNING)
+target_include_directories(rgw_a SYSTEM PUBLIC "../rapidjson/include")
 
 target_link_libraries(rgw_a librados cls_lock_client cls_rgw_client cls_refcount_client
   cls_log_client cls_statelog_client cls_timeindex_client cls_version_client
@@ -155,9 +151,13 @@ target_link_libraries(rgw_a librados cls_lock_client cls_rgw_client cls_refcount
   ${EXPAT_LIBRARIES}
   ${OPENLDAP_LIBRARIES} ${CRYPTO_LIBS})
 
-if (WITH_CURL_OPENSSL)
+if (WITH_CURL_OPENSSL OR (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL))
   target_link_libraries(rgw_a ${OPENSSL_LIBRARIES})
-endif (WITH_CURL_OPENSSL)
+endif()
+if (WITH_RADOSGW_BEAST_FRONTEND)
+  target_compile_definitions(rgw_a PUBLIC BOOST_COROUTINES_NO_DEPRECATION_WARNING)
+  target_link_libraries(rgw_a Boost::coroutine Boost::context)
+endif()
 
 set(radosgw_srcs
   rgw_loadgen_process.cc
@@ -177,7 +177,9 @@ endif (WITH_RADOSGW_BEAST_FRONTEND)
 
 add_library(radosgw_a STATIC ${radosgw_srcs}
   $<TARGET_OBJECTS:civetweb_common_objs>)
-target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES})
+if (WITH_RADOSGW_BEAST_FRONTEND AND WITH_RADOSGW_BEAST_OPENSSL)
+  target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES})
+endif()
 
 add_executable(radosgw rgw_main.cc)
 target_link_libraries(radosgw radosgw_a librados
@@ -193,6 +195,10 @@ add_dependencies(radosgw cls_rgw cls_lock cls_refcount
   cls_version cls_replica_log cls_user)
 install(TARGETS radosgw DESTINATION bin)
 
+if (WITH_RADOSGW_BEAST_FRONTEND)
+  target_link_libraries(radosgw_a ${OPENSSL_LIBRARIES})
+endif()
+
 set(radosgw_admin_srcs
   rgw_admin.cc
   rgw_orphan.cc)
index 4d7e32aaf4bbde25763e6300dc3593302e90868e..f6b78c42df27ca33c760873237be3a43e50a366a 100644 (file)
@@ -49,6 +49,8 @@
 #include "rgw_auth_s3.h"
 #include "rgw_lib.h"
 #include "rgw_lib_frontend.h"
+#include "rgw_http_client.h"
+#include "rgw_http_client_curl.h"
 
 #include <errno.h>
 #include <chrono>
@@ -119,6 +121,7 @@ namespace rgw {
        RGWLibFS* fs = iter->first->ref();
        uniq.unlock();
        fs->gc();
+       fs->update_user();
        fs->rele();
        uniq.lock();
        if (cur_gen != gen)
@@ -489,6 +492,7 @@ namespace rgw {
     rgw_tools_init(g_ceph_context);
 
     rgw_init_resolver();
+    rgw::curl::setup_curl(boost::none);
 
     store = RGWStoreManager::get_storage(g_ceph_context,
                                         g_conf->rgw_enable_gc_threads,
@@ -598,6 +602,7 @@ namespace rgw {
 
     rgw_tools_cleanup();
     rgw_shutdown_resolver();
+    rgw::curl::cleanup_curl();
 
     rgw_perf_stop(g_ceph_context);
 
index 311596b8820adcf192c610269a8d28ee28d7b92f..13ce136ccd1b1a60f062bce0cb6edf06a75b002f 100644 (file)
@@ -201,6 +201,8 @@ void usage()
   cout << "  reshard list               list all bucket resharding or scheduled to be reshared\n";
   cout << "  reshard process            process of scheduled reshard jobs\n";
   cout << "  reshard cancel             cancel resharding a bucket\n";
+  cout << "  reshard stale-instances list list stale-instances from bucket resharding\n";
+  cout << "  reshard stale-instances rm   cleanup stale-instances from bucket resharding\n";
   cout << "  sync error list            list sync error\n";
   cout << "  sync error trim            trim sync error\n";
   cout << "options:\n";
@@ -497,6 +499,8 @@ enum {
   OPT_RESHARD_STATUS,
   OPT_RESHARD_PROCESS,
   OPT_RESHARD_CANCEL,
+  OPT_RESHARD_STALE_INSTANCES_LIST,
+  OPT_RESHARD_STALE_INSTANCES_DELETE
 };
 
 static int get_cmd(const char *cmd, const char *prev_cmd, const char *prev_prev_cmd, bool *need_more)
@@ -531,6 +535,7 @@ static int get_cmd(const char *cmd, const char *prev_cmd, const char *prev_prev_
       strcmp(cmd, "replicalog") == 0 ||
       strcmp(cmd, "role") == 0 ||
       strcmp(cmd, "role-policy") == 0 ||
+      strcmp(cmd, "stale-instances") == 0 ||
       strcmp(cmd, "subuser") == 0 ||
       strcmp(cmd, "sync") == 0 ||
       strcmp(cmd, "usage") == 0 ||
@@ -951,6 +956,13 @@ static int get_cmd(const char *cmd, const char *prev_cmd, const char *prev_prev_
       return OPT_RESHARD_PROCESS;
     if (strcmp(cmd, "cancel") == 0)
       return OPT_RESHARD_CANCEL;
+  } else if ((prev_prev_cmd && strcmp(prev_prev_cmd, "reshard") == 0) &&
+            (strcmp(prev_cmd, "stale-instances") == 0)) {
+    if (strcmp(cmd, "list") == 0)
+      return OPT_RESHARD_STALE_INSTANCES_LIST;
+    if (strcmp(cmd, "rm") == 0 ||
+        strcmp(cmd, "delete") == 0)
+      return OPT_RESHARD_STALE_INSTANCES_DELETE;
   }
 
   return -EINVAL;
@@ -1055,6 +1067,22 @@ static void show_roles_info(vector<RGWRole>& roles, Formatter* formatter)
   formatter->flush(cout);
 }
 
+static void show_reshard_status(
+  const list<cls_rgw_bucket_instance_entry>& status, Formatter *formatter)
+{
+  formatter->open_array_section("status");
+  for (const auto& entry : status) {
+    formatter->open_object_section("entry");
+    formatter->dump_string("reshard_status", to_string(entry.reshard_status));
+    formatter->dump_string("new_bucket_instance_id",
+                          entry.new_bucket_instance_id);
+    formatter->dump_unsigned("num_shards", entry.num_shards);
+    formatter->close_section();
+  }
+  formatter->close_section();
+  formatter->flush(cout);
+}
+
 class StoreDestructor {
   RGWRados *store;
 public:
@@ -2458,7 +2486,7 @@ int check_reshard_bucket_params(RGWRados *store,
   int ret = init_bucket(tenant, bucket_name, bucket_id, bucket_info, bucket, &attrs);
   if (ret < 0) {
     cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
-    return -ret;
+    return ret;
   }
 
   int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
@@ -2780,7 +2808,7 @@ int main(int argc, const char **argv)
         return EINVAL;
       }
     } else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) {
-      max_size = strict_iecstrtoll(val.c_str(), &err);
+      max_size = strict_iec_cast<long long>(val.c_str(), &err);
       if (!err.empty()) {
         cerr << "ERROR: failed to parse max size: " << err << std::endl;
         return EINVAL;
@@ -5652,7 +5680,7 @@ next:
     for (int i = 0; i < max_shards; i++) {
       RGWRados::BucketShard bs(store);
       int shard_id = (bucket_info.num_shards > 0  ? i : -1);
-      int ret = bs.init(bucket, shard_id);
+      int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
       marker.clear();
 
       if (ret < 0) {
@@ -5713,7 +5741,7 @@ next:
     for (int i = 0; i < max_shards; i++) {
       RGWRados::BucketShard bs(store);
       int shard_id = (bucket_info.num_shards > 0  ? i : -1);
-      int ret = bs.init(bucket, shard_id);
+      int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
       if (ret < 0) {
         cerr << "ERROR: bs.init(bucket=" << bucket << ", shard=" << shard_id << "): " << cpp_strerror(-ret) << std::endl;
         return -ret;
@@ -5909,7 +5937,7 @@ next:
       return ret;
     }
 
-    RGWBucketReshard br(store, bucket_info, attrs);
+    RGWBucketReshard br(store, bucket_info, attrs, nullptr /* no callback */);
 
 #define DEFAULT_RESHARD_MAX_ENTRIES 1000
     if (max_entries < 1) {
@@ -5995,7 +6023,6 @@ next:
     return 0;
   }
 
-
   if (opt_cmd == OPT_RESHARD_STATUS) {
     if (bucket_name.empty()) {
       cerr << "ERROR: bucket not specified" << std::endl;
@@ -6011,16 +6038,16 @@ next:
       return -ret;
     }
 
-    RGWBucketReshard br(store, bucket_info, attrs);
+    RGWBucketReshard br(store, bucket_info, attrs, nullptr /* no callback */);
     list<cls_rgw_bucket_instance_entry> status;
     int r = br.get_status(&status);
     if (r < 0) {
-      cerr << "ERROR: could not get resharding status for bucket " << bucket_name << std::endl;
+      cerr << "ERROR: could not get resharding status for bucket " <<
+       bucket_name << std::endl;
       return -r;
     }
 
-    encode_json("status", status, formatter);
-    formatter->flush(cout);
+    show_reshard_status(status, formatter);
   }
 
   if (opt_cmd == OPT_RESHARD_PROCESS) {
@@ -6048,7 +6075,7 @@ next:
       return -ret;
     }
 
-    RGWBucketReshard br(store, bucket_info, attrs);
+    RGWBucketReshard br(store, bucket_info, attrs, nullptr /* no callback */);
     int ret = br.cancel();
     if (ret < 0) {
       if (ret == -EBUSY) {
@@ -7496,5 +7523,19 @@ next:
     }
   }
 
-  return 0;
+ if (opt_cmd == OPT_RESHARD_STALE_INSTANCES_LIST) {
+   ret = RGWBucketAdminOp::list_stale_instances(store, bucket_op,f);
+   if (ret < 0) {
+     cerr << "ERROR: listing stale instances" << cpp_strerror(-ret) << std::endl;
+   }
+ }
+
+ if (opt_cmd == OPT_RESHARD_STALE_INSTANCES_DELETE) {
+   ret = RGWBucketAdminOp::clear_stale_instances(store, bucket_op,f);
+   if (ret < 0) {
+     cerr << "ERROR: deleting stale instances" << cpp_strerror(-ret) << std::endl;
+   }
+ }
+
+ return 0;
 }
index 57dbd6b189bf2eb40b312d5b9021dfde9b521dab..4637eed20ebcdf9afde0545e72caece9386b2517 100644 (file)
 
 using namespace rgw::asio;
 
-ClientIO::ClientIO(tcp::socket& socket,
-                   parser_type& parser,
-                   beast::flat_buffer& buffer)
-  : socket(socket), parser(parser), buffer(buffer), txbuf(*this)
+ClientIO::ClientIO(parser_type& parser, bool is_ssl,
+                   const endpoint_type& local_endpoint,
+                   const endpoint_type& remote_endpoint)
+  : parser(parser), is_ssl(is_ssl),
+    local_endpoint(local_endpoint),
+    remote_endpoint(remote_endpoint),
+    txbuf(*this)
 {
 }
 
@@ -74,53 +77,16 @@ int ClientIO::init_env(CephContext *cct)
   env.set("SCRIPT_URI", url.to_string()); /* FIXME */
 
   char port_buf[16];
-  snprintf(port_buf, sizeof(port_buf), "%d", socket.local_endpoint().port());
+  snprintf(port_buf, sizeof(port_buf), "%d", local_endpoint.port());
   env.set("SERVER_PORT", port_buf);
-  env.set("REMOTE_ADDR", socket.remote_endpoint().address().to_string());
-  // TODO: set SERVER_PORT_SECURE if using ssl
+  if (is_ssl) {
+    env.set("SERVER_PORT_SECURE", port_buf);
+  }
+  env.set("REMOTE_ADDR", remote_endpoint.address().to_string());
   // TODO: set REMOTE_USER if authenticated
   return 0;
 }
 
-size_t ClientIO::write_data(const char* buf, size_t len)
-{
-  boost::system::error_code ec;
-  auto bytes = boost::asio::write(socket, boost::asio::buffer(buf, len), ec);
-  if (ec) {
-    derr << "write_data failed: " << ec.message() << dendl;
-    throw rgw::io::Exception(ec.value(), std::system_category());
-  }
-  /* According to the documentation of boost::asio::write if there is
-   * no error (signalised by ec), then bytes == len. We don't need to
-   * take care of partial writes in such situation. */
-  return bytes;
-}
-
-size_t ClientIO::read_data(char* buf, size_t max)
-{
-  auto& message = parser.get();
-  auto& body_remaining = message.body();
-  body_remaining.data = buf;
-  body_remaining.size = max;
-
-  dout(30) << this << " read_data for " << max << " with "
-      << buffer.size() << " bytes buffered" << dendl;
-
-  while (body_remaining.size && !parser.is_done()) {
-    boost::system::error_code ec;
-    beast::http::read_some(socket, buffer, parser, ec);
-    if (ec == beast::http::error::partial_message ||
-        ec == beast::http::error::need_buffer) {
-      break;
-    }
-    if (ec) {
-      derr << "failed to read body: " << ec.message() << dendl;
-      throw rgw::io::Exception(ec.value(), std::system_category());
-    }
-  }
-  return max - body_remaining.size;
-}
-
 size_t ClientIO::complete_request()
 {
   return 0;
index eeed4ee8051e28fa9e5dac1a066ec37e2c4f0d50..5a9957b46c6c7254fbc6e76600cfc5b599fca265 100644 (file)
@@ -18,22 +18,22 @@ using parser_type = beast::http::request_parser<beast::http::buffer_body>;
 
 class ClientIO : public io::RestfulClient,
                  public io::BuffererSink {
- private:
-  using tcp = boost::asio::ip::tcp;
-  tcp::socket& socket;
+ protected:
   parser_type& parser;
-  beast::flat_buffer& buffer; //< parse buffer
+ private:
+  const bool is_ssl;
+  using endpoint_type = boost::asio::ip::tcp::endpoint;
+  endpoint_type local_endpoint;
+  endpoint_type remote_endpoint;
 
   RGWEnv env;
 
   rgw::io::StaticOutputBufferer<> txbuf;
 
-  size_t write_data(const char *buf, size_t len) override;
-  size_t read_data(char *buf, size_t max);
-
  public:
-  ClientIO(tcp::socket& socket, parser_type& parser,
-           beast::flat_buffer& buffer);
+  ClientIO(parser_type& parser, bool is_ssl,
+           const endpoint_type& local_endpoint,
+           const endpoint_type& remote_endpoint);
   ~ClientIO() override;
 
   int init_env(CephContext *cct) override;
@@ -46,10 +46,6 @@ class ClientIO : public io::RestfulClient,
   size_t send_content_length(uint64_t len) override;
   size_t complete_header() override;
 
-  size_t recv_body(char* buf, size_t max) override {
-    return read_data(buf, max);
-  }
-
   size_t send_body(const char* buf, size_t len) override {
     return write_data(buf, len);
   }
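
The header change makes the split explicit: ClientIO no longer holds the socket or the parse buffer. It keeps only the parser (now protected so derived classes can reach it), an is_ssl flag, and the two endpoints used to fill SERVER_PORT, SERVER_PORT_SECURE and REMOTE_ADDR, while the actual write_data()/recv_body() overrides move into the templated StreamIO in the asio frontend. That way a single ClientIO implementation serves both plain TCP and TLS streams.
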
index db0ef66ddcf15871d22d8266603fd108af834b5c..e974ae7bf8e0c686298a65bd0dd0b1481b250b67 100644 (file)
@@ -8,12 +8,17 @@
 #include <vector>
 
 #include <boost/asio.hpp>
+#include <boost/asio/spawn.hpp>
 
 #include "common/errno.h"
 
 #include "rgw_asio_client.h"
 #include "rgw_asio_frontend.h"
 
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+#include <boost/asio/ssl.hpp>
+#endif
+
 #define dout_subsys ceph_subsys_rgw
 
 namespace {
@@ -63,145 +68,156 @@ void Pauser::wait()
 
 using tcp = boost::asio::ip::tcp;
 namespace beast = boost::beast;
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+namespace ssl = boost::asio::ssl;
+#endif
+
+template <typename Stream>
+class StreamIO : public rgw::asio::ClientIO {
+  Stream& stream;
+  beast::flat_buffer& buffer;
+ public:
+  StreamIO(Stream& stream, rgw::asio::parser_type& parser,
+           beast::flat_buffer& buffer, bool is_ssl,
+           const tcp::endpoint& local_endpoint,
+           const tcp::endpoint& remote_endpoint)
+      : ClientIO(parser, is_ssl, local_endpoint, remote_endpoint),
+        stream(stream), buffer(buffer)
+  {}
+
+  size_t write_data(const char* buf, size_t len) override {
+    boost::system::error_code ec;
+    auto bytes = boost::asio::write(stream, boost::asio::buffer(buf, len), ec);
+    if (ec) {
+      derr << "write_data failed: " << ec.message() << dendl;
+      throw rgw::io::Exception(ec.value(), std::system_category());
+    }
+    return bytes;
+  }
 
-class Connection {
-  RGWProcessEnv& env;
-  boost::asio::io_service::strand strand;
-  tcp::socket socket;
-
-  // references are bound to callbacks for async operations. if a callback
-  // function returns without issuing another operation, the reference is
-  // dropped and the Connection is deleted/closed
-  std::atomic<int> nref{0};
-  using Ref = boost::intrusive_ptr<Connection>;
+  size_t recv_body(char* buf, size_t max) override {
+    auto& message = parser.get();
+    auto& body_remaining = message.body();
+    body_remaining.data = buf;
+    body_remaining.size = max;
+
+    while (body_remaining.size && !parser.is_done()) {
+      boost::system::error_code ec;
+      beast::http::read_some(stream, buffer, parser, ec);
+      if (ec == beast::http::error::partial_message ||
+          ec == beast::http::error::need_buffer) {
+        break;
+      }
+      if (ec) {
+        derr << "failed to read body: " << ec.message() << dendl;
+        throw rgw::io::Exception(ec.value(), std::system_category());
+      }
+    }
+    return max - body_remaining.size;
+  }
+};
 
+template <typename Stream>
+void handle_connection(RGWProcessEnv& env, Stream& stream,
+                       beast::flat_buffer& buffer, bool is_ssl,
+                       boost::system::error_code& ec,
+                       boost::asio::yield_context yield)
+{
   // limit header to 4k, since we read it all into a single flat_buffer
   static constexpr size_t header_limit = 4096;
   // don't impose a limit on the body, since we read it in pieces
   static constexpr size_t body_limit = std::numeric_limits<size_t>::max();
 
-  beast::flat_buffer buffer;
-  boost::optional<rgw::asio::parser_type> parser;
-
-  using bad_response_type = beast::http::response<beast::http::empty_body>;
-  boost::optional<bad_response_type> response;
+  auto cct = env.store->ctx();
 
-  CephContext* ctx() const { return env.store->ctx(); }
-
-  void read_header() {
+  // read messages from the stream until eof
+  for (;;) {
     // configure the parser
-    parser.emplace();
-    parser->header_limit(header_limit);
-    parser->body_limit(body_limit);
+    rgw::asio::parser_type parser;
+    parser.header_limit(header_limit);
+    parser.body_limit(body_limit);
 
     // parse the header
-    beast::http::async_read_header(socket, buffer, *parser, strand.wrap(
-            std::bind(&Connection::on_header, Ref{this},
-                      std::placeholders::_1)));
-  }
-
-  void discard_unread_message() {
-    if (parser->is_done()) {
-      // nothing left to discard, start reading the next message
-      read_header();
-      return;
-    }
-
-    // read the rest of the request into a static buffer. multiple clients could
-    // write at the same time, but this is okay because we never read it back
-    static std::array<char, 1024> discard_buffer;
-
-    auto& body = parser->get().body();
-    body.size = discard_buffer.size();
-    body.data = discard_buffer.data();
-
-    beast::http::async_read_some(socket, buffer, *parser, strand.wrap(
-            std::bind(&Connection::on_discard_unread, Ref{this},
-                      std::placeholders::_1)));
-  }
-
-  void on_discard_unread(boost::system::error_code ec) {
-    if (ec == boost::asio::error::connection_reset) {
-      return;
-    }
-    if (ec) {
-      ldout(ctx(), 5) << "discard_unread_message failed: "
-          << ec.message() << dendl;
-      return;
-    }
-    discard_unread_message();
-  }
-
-  void on_write_error(boost::system::error_code ec) {
-    if (ec) {
-      ldout(ctx(), 5) << "failed to write response: " << ec.message() << dendl;
-    }
-  }
-
-  void on_header(boost::system::error_code ec) {
+    beast::http::async_read_header(stream, buffer, parser, yield[ec]);
     if (ec == boost::asio::error::connection_reset ||
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+        ec == ssl::error::stream_truncated ||
+#endif
         ec == beast::http::error::end_of_stream) {
       return;
     }
     if (ec) {
-      auto& message = parser->get();
-      ldout(ctx(), 1) << "failed to read header: " << ec.message() << dendl;
-      ldout(ctx(), 1) << "====== req done http_status=400 ======" << dendl;
-      response.emplace();
-      response->result(beast::http::status::bad_request);
-      response->version(message.version() == 10 ? 10 : 11);
-      response->prepare_payload();
-      beast::http::async_write(socket, *response, strand.wrap(
-            std::bind(&Connection::on_write_error, Ref{this},
-                      std::placeholders::_1)));
+      ldout(cct, 1) << "failed to read header: " << ec.message() << dendl;
+      auto& message = parser.get();
+      beast::http::response<beast::http::empty_body> response;
+      response.result(beast::http::status::bad_request);
+      response.version(message.version() == 10 ? 10 : 11);
+      response.prepare_payload();
+      beast::http::async_write(stream, response, yield[ec]);
+      if (ec) {
+        ldout(cct, 5) << "failed to write response: " << ec.message() << dendl;
+      }
+      ldout(cct, 1) << "====== req done http_status=400 ======" << dendl;
       return;
     }
 
     // process the request
     RGWRequest req{env.store->get_new_req_id()};
 
-    rgw::asio::ClientIO real_client{socket, *parser, buffer};
+    auto& socket = stream.lowest_layer();
+    StreamIO<Stream> real_client{stream, parser, buffer, is_ssl,
+                                 socket.local_endpoint(),
+                                 socket.remote_endpoint()};
 
     auto real_client_io = rgw::io::add_reordering(
-                            rgw::io::add_buffering(ctx(),
+                            rgw::io::add_buffering(cct,
                               rgw::io::add_chunking(
                                 rgw::io::add_conlen_controlling(
                                   &real_client))));
-    RGWRestfulIO client(ctx(), &real_client_io);
+    RGWRestfulIO client(cct, &real_client_io);
     process_request(env.store, env.rest, &req, env.uri_prefix,
                     *env.auth_registry, &client, env.olog);
 
-    if (parser->keep_alive()) {
-      // parse any unread bytes from the previous message (in case we replied
-      // before reading the entire body) before reading the next
-      discard_unread_message();
+    if (!parser.keep_alive()) {
+      return;
     }
-  }
 
- public:
-  Connection(RGWProcessEnv& env, tcp::socket&& socket)
-    : env(env), strand(socket.get_io_service()), socket(std::move(socket)) {}
-
-  void on_connect() {
-    read_header();
-  }
+    // if we failed before reading the entire message, discard any remaining
+    // bytes before reading the next
+    while (!parser.is_done()) {
+      static std::array<char, 1024> discard_buffer;
 
-  void get() { ++nref; }
-  void put() { if (nref.fetch_sub(1) == 1) { delete this; } }
+      auto& body = parser.get().body();
+      body.size = discard_buffer.size();
+      body.data = discard_buffer.data();
 
-  friend void intrusive_ptr_add_ref(Connection *c) { c->get(); }
-  friend void intrusive_ptr_release(Connection *c) { c->put(); }
-};
+      beast::http::async_read_some(stream, buffer, parser, yield[ec]);
+      if (ec == boost::asio::error::connection_reset) {
+        return;
+      }
+      if (ec) {
+        ldout(cct, 5) << "failed to discard unread message: "
+            << ec.message() << dendl;
+        return;
+      }
+    }
+  }
+}
 
 class AsioFrontend {
   RGWProcessEnv env;
   RGWFrontendConfig* conf;
   boost::asio::io_service service;
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+  boost::optional<ssl::context> ssl_context;
+  int init_ssl();
+#endif
 
   struct Listener {
     tcp::endpoint endpoint;
     tcp::acceptor acceptor;
     tcp::socket socket;
+    bool use_ssl = false;
 
     Listener(boost::asio::io_service& service)
       : acceptor(service), socket(service) {}
@@ -241,20 +257,46 @@ unsigned short parse_port(const char *input, boost::system::error_code& ec)
 }
 
 tcp::endpoint parse_endpoint(BOOST_ASIO_STRING_VIEW_PARAM input,
+                             unsigned short default_port,
                              boost::system::error_code& ec)
 {
   tcp::endpoint endpoint;
 
-  auto colon = input.find(':');
-  if (colon != input.npos) {
-    auto port_str = input.substr(colon + 1);
-    endpoint.port(parse_port(port_str.data(), ec));
-  } else {
-    endpoint.port(80);
+  if (input.empty()) {
+    ec = boost::asio::error::invalid_argument;
+    return endpoint;
   }
-  if (!ec) {
+
+  if (input[0] == '[') { // ipv6
+    const size_t addr_begin = 1;
+    const size_t addr_end = input.find(']');
+    if (addr_end == input.npos) { // no matching ]
+      ec = boost::asio::error::invalid_argument;
+      return endpoint;
+    }
+    if (addr_end + 1 < input.size()) {
+      // :port must must follow [ipv6]
+      if (input[addr_end + 1] != ':') {
+        ec = boost::asio::error::invalid_argument;
+        return endpoint;
+      } else {
+        auto port_str = input.substr(addr_end + 2);
+        endpoint.port(parse_port(port_str.data(), ec));
+      }
+    }
+    auto addr = input.substr(addr_begin, addr_end - addr_begin);
+    endpoint.address(boost::asio::ip::make_address_v6(addr, ec));
+  } else { // ipv4
+    auto colon = input.find(':');
+    if (colon != input.npos) {
+      auto port_str = input.substr(colon + 1);
+      endpoint.port(parse_port(port_str.data(), ec));
+      if (ec) {
+        return endpoint;
+      }
+    }
     auto addr = input.substr(0, colon);
-    endpoint.address(boost::asio::ip::make_address(addr, ec));
+    endpoint.address(boost::asio::ip::make_address_v4(addr, ec));
   }
   return endpoint;
 }
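
parse_endpoint() now understands bracketed IPv6 literals and takes its default port from the caller (80 for endpoint=, 443 for ssl_endpoint=, as seen in the init code below). The following self-contained sketch (not part of the patch) shows the accepted syntax; it uses std::string and std::stoul instead of string_view and boost error codes, and the default-port and error handling are simplified:

#include <iostream>
#include <string>

// Accepts "1.2.3.4", "1.2.3.4:8080", "[::1]" and "[::1]:7480".
// Returns false on syntax errors; std::stoul may throw on bad port digits.
bool parse_endpoint(const std::string& in, std::string& addr,
                    unsigned short& port, unsigned short default_port)
{
  port = default_port;
  if (in.empty()) return false;
  if (in[0] == '[') {                          // bracketed ipv6
    auto end = in.find(']');
    if (end == std::string::npos) return false;
    addr = in.substr(1, end - 1);
    if (end + 1 < in.size()) {                 // ":port" must follow "]"
      if (in[end + 1] != ':') return false;
      port = static_cast<unsigned short>(std::stoul(in.substr(end + 2)));
    }
  } else {                                     // ipv4
    auto colon = in.find(':');
    addr = in.substr(0, colon);
    if (colon != std::string::npos)
      port = static_cast<unsigned short>(std::stoul(in.substr(colon + 1)));
  }
  return true;
}

int main()
{
  std::string addr; unsigned short port;
  for (const char* s : {"192.168.0.1:8080", "[2001:db8::1]", "[::1]:7480"}) {
    if (parse_endpoint(s, addr, port, 80))
      std::cout << s << " -> addr=" << addr << " port=" << port << "\n";
  }
}
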
@@ -287,9 +329,16 @@ int AsioFrontend::init()
   boost::system::error_code ec;
   auto& config = conf->get_config_map();
 
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+  int r = init_ssl();
+  if (r < 0) {
+    return r;
+  }
+#endif
+
   // parse endpoints
-  auto range = config.equal_range("port");
-  for (auto i = range.first; i != range.second; ++i) {
+  auto ports = config.equal_range("port");
+  for (auto i = ports.first; i != ports.second; ++i) {
     auto port = parse_port(i->second.c_str(), ec);
     if (ec) {
       lderr(ctx()) << "failed to parse port=" << i->second << dendl;
@@ -299,9 +348,9 @@ int AsioFrontend::init()
     listeners.back().endpoint.port(port);
   }
 
-  range = config.equal_range("endpoint");
-  for (auto i = range.first; i != range.second; ++i) {
-    auto endpoint = parse_endpoint(i->second, ec);
+  auto endpoints = config.equal_range("endpoint");
+  for (auto i = endpoints.first; i != endpoints.second; ++i) {
+    auto endpoint = parse_endpoint(i->second, 80, ec);
     if (ec) {
       lderr(ctx()) << "failed to parse endpoint=" << i->second << dendl;
       return -ec.value();
@@ -335,6 +384,88 @@ int AsioFrontend::init()
   return drop_privileges(ctx());
 }
 
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+int AsioFrontend::init_ssl()
+{
+  boost::system::error_code ec;
+  auto& config = conf->get_config_map();
+
+  // ssl configuration
+  auto cert = config.find("ssl_certificate");
+  const bool have_cert = cert != config.end();
+  if (have_cert) {
+    // only initialize the ssl context if it's going to be used
+    ssl_context = boost::in_place(ssl::context::tls);
+  }
+
+  auto key = config.find("ssl_private_key");
+  const bool have_private_key = key != config.end();
+  if (have_private_key) {
+    if (!have_cert) {
+      lderr(ctx()) << "no ssl_certificate configured for ssl_private_key" << dendl;
+      return -EINVAL;
+    }
+    ssl_context->use_private_key_file(key->second, ssl::context::pem, ec);
+    if (ec) {
+      lderr(ctx()) << "failed to add ssl_private_key=" << key->second
+          << ": " << ec.message() << dendl;
+      return -ec.value();
+    }
+  }
+  if (have_cert) {
+    ssl_context->use_certificate_chain_file(cert->second, ec);
+    if (ec) {
+      lderr(ctx()) << "failed to use ssl_certificate=" << cert->second
+          << ": " << ec.message() << dendl;
+      return -ec.value();
+    }
+    if (!have_private_key) {
+      // attempt to use it as a private key if a separate one wasn't provided
+      ssl_context->use_private_key_file(cert->second, ssl::context::pem, ec);
+      if (ec) {
+        lderr(ctx()) << "failed to use ssl_certificate=" << cert->second
+            << " as a private key: " << ec.message() << dendl;
+        return -ec.value();
+      }
+    }
+  }
+
+  // parse ssl endpoints
+  auto ports = config.equal_range("ssl_port");
+  for (auto i = ports.first; i != ports.second; ++i) {
+    if (!have_cert) {
+      lderr(ctx()) << "no ssl_certificate configured for ssl_port" << dendl;
+      return -EINVAL;
+    }
+    auto port = parse_port(i->second.c_str(), ec);
+    if (ec) {
+      lderr(ctx()) << "failed to parse ssl_port=" << i->second << dendl;
+      return -ec.value();
+    }
+    listeners.emplace_back(service);
+    listeners.back().endpoint.port(port);
+    listeners.back().use_ssl = true;
+  }
+
+  auto endpoints = config.equal_range("ssl_endpoint");
+  for (auto i = endpoints.first; i != endpoints.second; ++i) {
+    if (!have_cert) {
+      lderr(ctx()) << "no ssl_certificate configured for ssl_endpoint" << dendl;
+      return -EINVAL;
+    }
+    auto endpoint = parse_endpoint(i->second, 443, ec);
+    if (ec) {
+      lderr(ctx()) << "failed to parse ssl_endpoint=" << i->second << dendl;
+      return -ec.value();
+    }
+    listeners.emplace_back(service);
+    listeners.back().endpoint = endpoint;
+    listeners.back().use_ssl = true;
+  }
+  return 0;
+}
+#endif // WITH_RADOSGW_BEAST_OPENSSL
+
 void AsioFrontend::accept(Listener& l, boost::system::error_code ec)
 {
   if (!l.acceptor.is_open()) {
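
init_ssl() only constructs the ssl::context when an ssl_certificate is configured, falls back to trying the certificate file as the private key when no separate ssl_private_key is given, and rejects ssl_port/ssl_endpoint options without a certificate. In ceph.conf terms this corresponds to beast frontend options along the lines of "rgw frontends = beast port=80 ssl_port=443 ssl_certificate=/etc/ceph/radosgw.pem ssl_private_key=/etc/ceph/radosgw.key"; the paths and exact syntax here are illustrative, the radosgw frontends documentation (also updated in this commit) is authoritative.
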
@@ -350,9 +481,42 @@ void AsioFrontend::accept(Listener& l, boost::system::error_code ec)
                             accept(l, ec);
                           });
 
-  boost::intrusive_ptr<Connection> conn{new Connection(env, std::move(socket))};
-  conn->on_connect();
-  // reference drops here, but on_connect() takes another
+  // spawn a coroutine to handle the connection
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+  if (l.use_ssl) {
+    boost::asio::spawn(service, std::bind(
+      [this] (boost::asio::yield_context yield, tcp::socket& s) mutable {
+        // wrap the socket in an ssl stream
+        ssl::stream<tcp::socket&> stream{s, *ssl_context};
+        beast::flat_buffer buffer;
+        // do ssl handshake
+        boost::system::error_code ec;
+        auto bytes = stream.async_handshake(ssl::stream_base::server,
+                                            buffer.data(), yield[ec]);
+        if (ec) {
+          ldout(ctx(), 1) << "ssl handshake failed: " << ec.message() << dendl;
+          return;
+        }
+        buffer.consume(bytes);
+        handle_connection(env, stream, buffer, true, ec, yield);
+        if (!ec) {
+          // ssl shutdown (ignoring errors)
+          stream.async_shutdown(yield[ec]);
+        }
+        s.shutdown(tcp::socket::shutdown_both, ec);
+      }, std::placeholders::_1, std::move(socket)));
+  } else {
+#else
+  {
+#endif // WITH_RADOSGW_BEAST_OPENSSL
+    boost::asio::spawn(service, std::bind(
+      [this] (boost::asio::yield_context yield, tcp::socket& s) mutable {
+        beast::flat_buffer buffer;
+        boost::system::error_code ec;
+        handle_connection(env, s, buffer, false, ec, yield);
+        s.shutdown(tcp::socket::shutdown_both, ec);
+      }, std::placeholders::_1, std::move(socket)));
+  }
 }
 
 int AsioFrontend::run()
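
accept() no longer builds a reference-counted Connection object with chained callbacks; it spawns one stackful coroutine per accepted socket, optionally wraps the socket in an ssl::stream first, and hands it to the templated handle_connection(). The sketch below is not RGW code, just the underlying boost::asio pattern (assuming Boost with coroutine support is available): spawn a coroutine per connection and template the handler over the stream type, here with a trivial echo body instead of HTTP processing.

#include <boost/asio.hpp>
#include <boost/asio/spawn.hpp>
#include <functional>
#include <memory>

using tcp = boost::asio::ip::tcp;

// One body serves any stream type (tcp::socket, ssl::stream<tcp::socket&>, ...).
template <typename Stream>
void echo(Stream& stream, boost::asio::yield_context yield)
{
  char buf[1024];
  boost::system::error_code ec;
  for (;;) {
    auto n = stream.async_read_some(boost::asio::buffer(buf), yield[ec]);
    if (ec) return;                            // eof or error ends the coroutine
    boost::asio::async_write(stream, boost::asio::buffer(buf, n), yield[ec]);
    if (ec) return;
  }
}

int main()
{
  boost::asio::io_service service;
  tcp::acceptor acceptor(service, tcp::endpoint(tcp::v4(), 12345));
  std::function<void()> do_accept = [&] {
    auto sock = std::make_shared<tcp::socket>(service);
    acceptor.async_accept(*sock, [&, sock](boost::system::error_code ec) {
      if (!ec) {
        // one coroutine per connection, like handle_connection() above
        boost::asio::spawn(service, [sock](boost::asio::yield_context yield) {
          echo(*sock, yield);
        });
      }
      do_accept();                             // keep accepting
    });
  };
  do_accept();
  service.run();
}
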
index e18749060fef75e2ad8c8f257338c1b2513096d2..1073ba1ed20dd579615185460ae85c03f454f301 100644 (file)
@@ -5,6 +5,7 @@
 
 #include "rgw_common.h"
 #include "rgw_auth.h"
+#include "rgw_quota.h"
 #include "rgw_user.h"
 #include "rgw_http_client.h"
 #include "rgw_keystone.h"
@@ -444,6 +445,10 @@ void rgw::auth::RemoteApplier::create_account(const rgw_user& acct_user,
   user_info.user_id = new_acct_user;
   user_info.display_name = info.acct_name;
 
+  user_info.max_buckets = cct->_conf->rgw_user_max_buckets;
+  rgw_apply_default_bucket_quota(user_info.bucket_quota, cct);
+  rgw_apply_default_user_quota(user_info.user_quota, cct);
+
   int ret = rgw_store_user_info(store, user_info, nullptr, nullptr,
                                 real_time(), true);
   if (ret < 0) {
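
With this change, accounts auto-created for remotely authenticated users (the RemoteApplier path used for e.g. Keystone-backed access) start out with the configured rgw_user_max_buckets and with the default bucket and user quotas applied, instead of unlimited defaults. The rgw_apply_default_bucket_quota() / rgw_apply_default_user_quota() helpers are added in the rgw_quota changes further down.
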
index 0904e825520e8ed49372d78fa513c1985eb2585d..e685705eec0f2edb3dc4f2059e10807dea59d969 100644 (file)
@@ -659,7 +659,8 @@ get_v4_canon_req_hash(CephContext* cct,
 
   const auto canonical_req_hash = calc_hash_sha256(canonical_req);
 
-  ldout(cct, 10) << "canonical request = " << canonical_req << dendl;
+  using sanitize = rgw::crypt_sanitize::log_content;
+  ldout(cct, 10) << "canonical request = " << sanitize{canonical_req} << dendl;
   ldout(cct, 10) << "canonical request hash = "
                  << buf_to_hex(canonical_req_hash).data() << dendl;
 
index f8e8c2366a8802f52108d9280ed4ef0dd14e28ab..e4a56d8497a6ec6ee4d9ce7dee92edcb24a4b750 100644 (file)
@@ -13,6 +13,7 @@
 #include "common/errno.h"
 #include "common/ceph_json.h"
 #include "common/backport14.h"
+#include "include/scope_guard.h"
 #include "rgw_rados.h"
 #include "rgw_acl.h"
 #include "rgw_acl_s3.h"
@@ -26,7 +27,7 @@
 #include "include/rados/librados.hpp"
 // until everything is moved from rgw_common
 #include "rgw_common.h"
-
+#include "rgw_reshard.h"
 #include "cls/user/cls_user_types.h"
 
 #define dout_context g_ceph_context
@@ -297,7 +298,8 @@ int rgw_bucket_instance_store_info(RGWRados *store, string& entry, bufferlist& b
   return store->meta_mgr->put_entry(bucket_instance_meta_handler, entry, bl, exclusive, objv_tracker, mtime, pattrs);
 }
 
-int rgw_bucket_instance_remove_entry(RGWRados *store, string& entry, RGWObjVersionTracker *objv_tracker) {
+int rgw_bucket_instance_remove_entry(RGWRados *store, const string& entry,
+                                     RGWObjVersionTracker *objv_tracker) {
   return store->meta_mgr->remove_entry(bucket_instance_meta_handler, entry, objv_tracker);
 }
 
@@ -1654,6 +1656,213 @@ int RGWBucketAdminOp::set_quota(RGWRados *store, RGWBucketAdminOpState& op_state
   return bucket.set_quota(op_state);
 }
 
+static int purge_bucket_instance(RGWRados *store, const RGWBucketInfo& bucket_info)
+{
+  int max_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
+  for (int i = 0; i < max_shards; i++) {
+    RGWRados::BucketShard bs(store);
+    int shard_id = (bucket_info.num_shards > 0  ? i : -1);
+    int ret = bs.init(bucket_info.bucket, shard_id, nullptr);
+    if (ret < 0) {
+      cerr << "ERROR: bs.init(bucket=" << bucket_info.bucket << ", shard=" << shard_id
+           << "): " << cpp_strerror(-ret) << std::endl;
+      return ret;
+    }
+    ret = store->bi_remove(bs);
+    if (ret < 0) {
+      cerr << "ERROR: failed to remove bucket index object: "
+           << cpp_strerror(-ret) << std::endl;
+      return ret;
+    }
+  }
+  return 0;
+}
+
+inline auto split_tenant(const std::string& bucket_name) ->
+  std::pair<std::string,std::string>
+{
+  auto p = bucket_name.find('/');
+  if(p != std::string::npos) {
+    return std::make_pair(bucket_name.substr(0,p), bucket_name.substr(p+1));
+  }
+  return std::make_pair(std::string(), bucket_name);
+}
+
+using bucket_instance_ls = std::vector<RGWBucketInfo>;
+void get_stale_instances(RGWRados *store, const std::string& bucket_name,
+                         const vector<std::string>& lst,
+                         bucket_instance_ls& stale_instances)
+{
+
+  RGWObjectCtx obj_ctx(store);
+
+  bucket_instance_ls other_instances;
+// first iterate over the entries, and pick up the done buckets; these
+// are guaranteed to be stale
+  for (const auto& bucket_instance : lst){
+    RGWBucketInfo binfo;
+    int r = store->get_bucket_instance_info(obj_ctx, bucket_instance,
+                                            binfo, nullptr,nullptr);
+    if (r < 0){
+      // this can only happen if someone deletes us right when we're processing
+      lderr(store->ctx()) << "Bucket instance is invalid: " << bucket_instance
+                          << cpp_strerror(-r) << dendl;
+      continue;
+    }
+    if (binfo.reshard_status == CLS_RGW_RESHARD_DONE)
+      stale_instances.emplace_back(std::move(binfo));
+    else {
+      other_instances.emplace_back(std::move(binfo));
+    }
+  }
+
+  // Read the cur bucket info, if the bucket doesn't exist we can simply return
+  // all the instances
+  std::string tenant,bucket;
+  std::tie(tenant, bucket) = split_tenant(bucket_name);
+  RGWBucketInfo cur_bucket_info;
+  int r = store->get_bucket_info(obj_ctx, tenant, bucket, cur_bucket_info, nullptr);
+  if (r < 0) {
+    if (r == -ENOENT) {
+      // bucket doesn't exist, everything is stale then
+      stale_instances.insert(std::end(stale_instances),
+                             std::make_move_iterator(other_instances.begin()),
+                             std::make_move_iterator(other_instances.end()));
+    } else {
+      // all bets are off if we can't read the bucket, just return the sureshot stale instances
+      lderr(store->ctx()) << "error: reading bucket info for bucket: "
+                          << bucket << cpp_strerror(-r) << dendl;
+    }
+    return;
+  }
+
+  // Don't process further in this round if bucket is resharding
+  if (cur_bucket_info.reshard_status == CLS_RGW_RESHARD_IN_PROGRESS)
+    return;
+
+  other_instances.erase(std::remove_if(other_instances.begin(), other_instances.end(),
+                                       [&cur_bucket_info](const RGWBucketInfo& b){
+                                         return (b.bucket.bucket_id == cur_bucket_info.bucket.bucket_id ||
+                                                 b.bucket.bucket_id == cur_bucket_info.new_bucket_instance_id);
+                                       }),
+                        other_instances.end());
+
+  // check if there are still instances left
+  if (other_instances.empty()) {
+    return;
+  }
+
+  // Now we have a bucket with instances where the reshard status is none, this
+  // usually happens when the reshard process couldn't complete, lockdown the
+  // bucket and walk through these instances to make sure no one else interferes
+  // with these
+  {
+    RGWBucketReshardLock reshard_lock(store, cur_bucket_info, true);
+    r = reshard_lock.lock();
+    if (r < 0) {
+      // most likely bucket is under reshard, return the sureshot stale instances
+      ldout(store->ctx(), 5) << __func__
+                             << "failed to take reshard lock; reshard underway likey" << dendl;
+      return;
+    }
+    auto sg = make_scope_guard([&reshard_lock](){ reshard_lock.unlock();} );
+    // this should be fast enough that we may not need to renew locks and check
+    // exit status?, should we read the values of the instances again?
+    stale_instances.insert(std::end(stale_instances),
+                           std::make_move_iterator(other_instances.begin()),
+                           std::make_move_iterator(other_instances.end()));
+  }
+
+  return;
+}
+
+static int process_stale_instances(RGWRados *store, RGWBucketAdminOpState& op_state,
+                                   RGWFormatterFlusher& flusher,
+                                   std::function<void(const bucket_instance_ls&,
+                                                      Formatter *,
+                                                      RGWRados*)> process_f)
+{
+  std::string marker;
+  void *handle;
+  Formatter *formatter = flusher.get_formatter();
+  static constexpr auto default_max_keys = 1000;
+
+  int ret = store->meta_mgr->list_keys_init("bucket.instance", marker, &handle);
+  if (ret < 0) {
+    cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+    return ret;
+  }
+
+  bool truncated;
+
+  formatter->open_array_section("keys");
+
+  do {
+    list<std::string> keys;
+
+    ret = store->meta_mgr->list_keys_next(handle, default_max_keys, keys, &truncated);
+    if (ret < 0 && ret != -ENOENT) {
+      cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+      return ret;
+    } if (ret != -ENOENT) {
+      // partition the list of buckets by buckets as the listing is un sorted,
+      // since it would minimize the reads to bucket_info
+      std::unordered_map<std::string, std::vector<std::string>> bucket_instance_map;
+      for (auto &key: keys) {
+        auto pos = key.find(':');
+        if(pos != std::string::npos)
+          bucket_instance_map[key.substr(0,pos)].emplace_back(std::move(key));
+      }
+      for (const auto& kv: bucket_instance_map) {
+        bucket_instance_ls stale_lst;
+        get_stale_instances(store, kv.first, kv.second, stale_lst);
+        process_f(stale_lst, formatter, store);
+      }
+    }
+  } while (truncated);
+
+  formatter->close_section(); // keys
+  formatter->flush(cout);
+  return 0;
+}
+
+int RGWBucketAdminOp::list_stale_instances(RGWRados *store,
+                                           RGWBucketAdminOpState& op_state,
+                                           RGWFormatterFlusher& flusher)
+{
+  auto process_f = [](const bucket_instance_ls& lst,
+                      Formatter *formatter,
+                      RGWRados*){
+                     for (const auto& binfo: lst)
+                       formatter->dump_string("key", binfo.bucket.get_key());
+                   };
+  return process_stale_instances(store, op_state, flusher, process_f);
+}
+
+
+int RGWBucketAdminOp::clear_stale_instances(RGWRados *store,
+                                            RGWBucketAdminOpState& op_state,
+                                            RGWFormatterFlusher& flusher)
+{
+  auto process_f = [](const bucket_instance_ls& lst,
+                      Formatter *formatter,
+                      RGWRados *store){
+                     for (const auto &binfo: lst) {
+                       int ret = purge_bucket_instance(store, binfo);
+                       if (ret == 0){
+                         auto md_key = "bucket.instance:" + binfo.bucket.get_key();
+                         ret = store->meta_mgr->remove(md_key);
+                       }
+                       formatter->open_object_section("delete_status");
+                       formatter->dump_string("bucket_instance", binfo.bucket.get_key());
+                       formatter->dump_int("status", -ret);
+                       formatter->close_section();
+                     }
+                   };
+
+  return process_stale_instances(store, op_state, flusher, process_f);
+}
+
 void rgw_data_change::dump(Formatter *f) const
 {
   string type;
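
Taken together, the new code implements the stale-instance scan behind the admin commands added earlier. process_stale_instances() lists all bucket.instance metadata keys, groups them per bucket (the key format is bucket:instance-id), and hands each group to get_stale_instances(). Instances whose reshard_status is DONE are stale by definition; the remaining candidates are compared against the live bucket's current and pending instance ids, and are only declared stale while holding the reshard lock so an in-flight reshard is never disturbed. The list variant just prints the stale keys, while rm additionally purges the per-shard index objects via purge_bucket_instance() and removes the bucket.instance metadata entry, reporting a per-instance status. For a bucket name of the form tenant/bucket, split_tenant() yields the pair (tenant, bucket); a bare name yields an empty tenant.
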
@@ -2379,7 +2588,8 @@ public:
     RGWListRawObjsCtx ctx;
   };
 
-  int remove(RGWRados *store, string& entry, RGWObjVersionTracker& objv_tracker) override {
+  int remove(RGWRados *store, string& entry,
+            RGWObjVersionTracker& objv_tracker) override {
     RGWBucketInfo info;
     RGWObjectCtx obj_ctx(store);
 
@@ -2387,7 +2597,8 @@ public:
     if (ret < 0 && ret != -ENOENT)
       return ret;
 
-    return rgw_bucket_instance_remove_entry(store, entry, &info.objv_tracker);
+    return rgw_bucket_instance_remove_entry(store, entry,
+                                           &info.objv_tracker);
   }
 
   void get_pool_and_oid(RGWRados *store, const string& key, rgw_pool& pool, string& oid) override {
index f43365515c5b788b5974809ebea323dfd2b2db4a..a574be618203781e43e7162ec665215386f370ba 100644 (file)
@@ -34,7 +34,8 @@ extern int rgw_bucket_parse_bucket_instance(const string& bucket_instance, strin
 extern int rgw_bucket_parse_bucket_key(CephContext *cct, const string& key,
                                        rgw_bucket* bucket, int *shard_id);
 
-extern int rgw_bucket_instance_remove_entry(RGWRados *store, string& entry, RGWObjVersionTracker *objv_tracker);
+extern int rgw_bucket_instance_remove_entry(RGWRados *store, const string& entry,
+                                           RGWObjVersionTracker *objv_tracker);
 extern void rgw_bucket_instance_key_to_oid(string& key);
 extern void rgw_bucket_instance_oid_to_key(string& oid);
 
@@ -137,7 +138,7 @@ public:
   /**
    * Remove a bucket from the user's list by name.
    */
-  void remove(string& name) {
+  void remove(const string& name) {
     map<string, RGWBucketEnt>::iterator iter;
     iter = buckets.find(name);
     if (iter != buckets.end()) {
@@ -335,6 +336,12 @@ public:
                         RGWFormatterFlusher& flusher,
                         bool warnings_only = false);
   static int set_quota(RGWRados *store, RGWBucketAdminOpState& op_state);
+
+  static int list_stale_instances(RGWRados *store, RGWBucketAdminOpState& op_state,
+                                 RGWFormatterFlusher& flusher);
+
+  static int clear_stale_instances(RGWRados *store, RGWBucketAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher);
 };
 
 
index 96007f398568974a68f583813d458245e27290ca..4980d233b39f65108b5c6aab6e892051b6ce841b 100644 (file)
@@ -1080,6 +1080,31 @@ string RGWHTTPArgs::sys_get(const string& name, bool * const exists) const
   return e ? iter->second : string();
 }
 
+bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env)
+{
+  const auto& m = env.get_map();
+  // frontend connected with ssl
+  if (m.count("SERVER_PORT_SECURE")) {
+    return true;
+  }
+  // ignore proxy headers unless explicitly enabled
+  if (!cct->_conf->rgw_trust_forwarded_https) {
+    return false;
+  }
+  // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Forwarded
+  // Forwarded: by=<identifier>; for=<identifier>; host=<host>; proto=<http|https>
+  auto i = m.find("HTTP_FORWARDED");
+  if (i != m.end() && i->second.find("proto=https") != std::string::npos) {
+    return true;
+  }
+  // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-Proto
+  i = m.find("HTTP_X_FORWARDED_PROTO");
+  if (i != m.end() && i->second == "https") {
+    return true;
+  }
+  return false;
+}
+
 bool verify_user_permission(struct req_state * const s,
                             RGWAccessControlPolicy * const user_acl,
                             const int perm)
@@ -1870,12 +1895,15 @@ bool match_policy(boost::string_view pattern, boost::string_view input,
 {
   const uint32_t flag2 = flag & (MATCH_POLICY_ACTION|MATCH_POLICY_ARN) ?
       MATCH_CASE_INSENSITIVE : 0;
+  const bool colonblocks = !(flag & (MATCH_POLICY_RESOURCE |
+                                    MATCH_POLICY_STRING));
 
   const auto npos = boost::string_view::npos;
   boost::string_view::size_type last_pos_input = 0, last_pos_pattern = 0;
   while (true) {
-    auto cur_pos_input = input.find(":", last_pos_input);
-    auto cur_pos_pattern = pattern.find(":", last_pos_pattern);
+    auto cur_pos_input = colonblocks ? input.find(":", last_pos_input) : npos;
+    auto cur_pos_pattern =
+      colonblocks ? pattern.find(":", last_pos_pattern) : npos;
 
     auto substr_input = input.substr(last_pos_input, cur_pos_input);
     auto substr_pattern = pattern.substr(last_pos_pattern, cur_pos_pattern);
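
match_policy() previously split both pattern and input on ':' and matched block by block, so a wildcard could never span a ':'. The new colonblocks flag keeps that behaviour for actions and full ARNs but disables it for MATCH_POLICY_RESOURCE and MATCH_POLICY_STRING, and ARN::match() (in the IAM policy changes further down) now compares the resource part with MATCH_POLICY_RESOURCE. In practice a resource pattern such as mybucket/* should therefore also match object keys that happen to contain ':'.
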
index 04f6d5fc95fe81f8e1061e983a6840e8c649681f..8e927ec9391ba8e33f58fdbdc3fe66f92915ec27 100644 (file)
@@ -420,6 +420,10 @@ public:
   const std::map<string, string, ltstr_nocase>& get_map() const { return env_map; }
 };
 
+// return true if the connection is secure. this either means that the
+// connection arrived via ssl, or was forwarded as https by a trusted proxy
+bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env);
+
 enum http_op {
   OP_GET,
   OP_PUT,
index d79188a8d4460d20f43af013c6eb34b4de36c281..47babe661023983dbbee0908d393e0f96b614bf2 100644 (file)
@@ -170,7 +170,7 @@ int RGWAsyncLockSystemObj::_send_request()
   utime_t duration(duration_secs, 0);
   l.set_duration(duration);
   l.set_cookie(cookie);
-  l.set_renew(true);
+  l.set_may_renew(true);
 
   return l.lock_exclusive(&ref.ioctx, ref.oid);
 }
index 81a84ad698af6939832aa81fd9badabc8efbeee4..d5893734ff71ea1e3eae7df6787438b48b823557 100644 (file)
@@ -1038,7 +1038,7 @@ int rgw_s3_prepare_encrypt(struct req_state* s,
         return -ERR_INVALID_ENCRYPTION_ALGORITHM;
       }
       if (s->cct->_conf->rgw_crypt_require_ssl &&
-          !s->info.env->exists("SERVER_PORT_SECURE")) {
+          !rgw_transport_is_secure(s->cct, *s->info.env)) {
         ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
         return -ERR_INVALID_REQUEST;
       }
@@ -1144,7 +1144,7 @@ int rgw_s3_prepare_encrypt(struct req_state* s,
         return -EINVAL;
       }
       if (s->cct->_conf->rgw_crypt_require_ssl &&
-          !s->info.env->exists("SERVER_PORT_SECURE")) {
+          !rgw_transport_is_secure(s->cct, *s->info.env)) {
         ldout(s->cct, 5) << "ERROR: insecure request, rgw_crypt_require_ssl is set" << dendl;
         return -ERR_INVALID_REQUEST;
       }
@@ -1260,7 +1260,7 @@ int rgw_s3_prepare_decrypt(struct req_state* s,
 
   if (stored_mode == "SSE-C-AES256") {
     if (s->cct->_conf->rgw_crypt_require_ssl &&
-        !s->info.env->exists("SERVER_PORT_SECURE")) {
+        !rgw_transport_is_secure(s->cct, *s->info.env)) {
       ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
       return -ERR_INVALID_REQUEST;
     }
@@ -1342,7 +1342,7 @@ int rgw_s3_prepare_decrypt(struct req_state* s,
 
   if (stored_mode == "SSE-KMS") {
     if (s->cct->_conf->rgw_crypt_require_ssl &&
-        !s->info.env->exists("SERVER_PORT_SECURE")) {
+        !rgw_transport_is_secure(s->cct, *s->info.env)) {
       ldout(s->cct, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
       return -ERR_INVALID_REQUEST;
     }
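
These hunks switch the SSE-C and SSE-KMS rgw_crypt_require_ssl checks from looking at SERVER_PORT_SECURE directly to rgw_transport_is_secure(), added in the rgw_common changes above. The helper treats a request as secure if the frontend set SERVER_PORT_SECURE (which the beast frontend now does for SSL listeners), or, only when rgw_trust_forwarded_https is enabled, if a trusted proxy forwarded it with Forwarded: proto=https or X-Forwarded-Proto: https. The aws:SecureTransport IAM condition in the rgw_op changes below gets the same treatment, so server-side encryption keeps working behind a TLS-terminating proxy without disabling the SSL requirement.
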
index dca9ea9196e1b3f71eb3fbdd8761d75d4ea6db55..ad43157824e592692b67a64680b8eebaee65febe 100644 (file)
@@ -2546,6 +2546,10 @@ public:
             }
             logger.log("remove");
             call(data_sync_module->remove_object(sync_env, *bucket_info, key, timestamp, versioned, versioned_epoch.value_or(0), &zones_trace));
+            // our copy of the object is more recent, continue as if it succeeded
+            if (retcode == -ERR_PRECONDITION_FAILED) {
+              retcode = 0;
+            }
           } else if (op == CLS_RGW_OP_LINK_OLH_DM) {
             logger.log("creating delete marker");
             set_status("creating delete marker");
index 796dcbf9d37596aed9ee294b6677a0959e10123b..86d647f955685ce1af0686aabecc8dbbcb8f79d8 100644 (file)
@@ -1204,6 +1204,13 @@ namespace rgw {
 
     RGWUserInfo* get_user() { return &user; }
 
+    void update_user() {
+      RGWUserInfo _user = user;
+      int ret = rgw_get_user_info_by_access_key(rgwlib.get_store(), key.id, user);
+      if (ret != 0)
+        user = _user;
+    }
+
     void close();
     void gc();
   }; /* RGWLibFS */
index 9f31db2eaad7b73e3feb7592d139abf331170b77..331802144db9d2ab075cfe104800c2d7607917f0 100644 (file)
@@ -381,7 +381,7 @@ bool ARN::match(const ARN& candidate) const {
     return false;
   }
 
-  if (!match_policy(resource, candidate.resource, MATCH_POLICY_ARN)) {
+  if (!match_policy(resource, candidate.resource, MATCH_POLICY_RESOURCE)) {
     return false;
   }
 
index 6a275239c5fc744def10b6e8f19614c6135f33b6..30337dcd5b4603b88ff71bd42e70b1e758864fcd 100644 (file)
@@ -1079,7 +1079,9 @@ done:
   return 0;
 }
 
-int RGWMetadataManager::remove_entry(RGWMetadataHandler *handler, string& key, RGWObjVersionTracker *objv_tracker)
+int RGWMetadataManager::remove_entry(RGWMetadataHandler *handler,
+                                    const string& key,
+                                    RGWObjVersionTracker *objv_tracker)
 {
   string section;
   RGWMetadataLogData log_data;
index 0dfc73f112de2ac6b82363cc5d77f23938d5bb0e..5c3205bbc4436ada694a61b4c73d0f3e3b28d69c 100644 (file)
@@ -346,7 +346,9 @@ public:
 
   int put_entry(RGWMetadataHandler *handler, const string& key, bufferlist& bl, bool exclusive,
                 RGWObjVersionTracker *objv_tracker, real_time mtime, map<string, bufferlist> *pattrs = NULL);
-  int remove_entry(RGWMetadataHandler *handler, string& key, RGWObjVersionTracker *objv_tracker);
+  int remove_entry(RGWMetadataHandler *handler,
+                  const string& key,
+                  RGWObjVersionTracker *objv_tracker);
   int get(string& metadata_key, Formatter *f);
   int put(string& metadata_key, bufferlist& bl,
           RGWMetadataHandler::sync_type_t sync_mode,
index dce4549f6396235477387f29c1ba8f0196838e76..730de87fa31efc40f2b7107667d7d7316fbe9f14 100644 (file)
@@ -608,9 +608,7 @@ rgw::IAM::Environment rgw_build_iam_environment(RGWRados* store,
     e.emplace("aws:Referer", i->second);
   }
 
-  // These seem to be the semantics, judging from rest_rgw_s3.cc
-  i = m.find("SERVER_PORT_SECURE");
-  if (i != m.end()) {
+  if (rgw_transport_is_secure(s->cct, *s->info.env)) {
     e.emplace("aws:SecureTransport", "true");
   }
 
@@ -2281,22 +2279,13 @@ int RGWListBucket::verify_permission()
 
 int RGWListBucket::parse_max_keys()
 {
-  if (!max_keys.empty()) {
-    char *endptr;
-    max = strtol(max_keys.c_str(), &endptr, 10);
-    if (endptr) {
-      if (endptr == max_keys.c_str()) return -EINVAL;
-      while (*endptr && isspace(*endptr)) // ignore white space
-        endptr++;
-      if (*endptr) {
-        return -EINVAL;
-      }
-    }
-  } else {
-    max = default_max;
-  }
-
-  return 0;
+  // Bound max value of max-keys to configured value for security
+  // Bound min value of max-keys to '0'
+  // Some S3 clients explicitly send max-keys=0 to detect if the bucket is
+  // empty without listing any items.
+  return parse_value_and_bound(max_keys, max, 0,
+                       s->cct->_conf->get_val<uint64_t>("rgw_max_listing_results"),
+                       default_max);
 }
 
 void RGWListBucket::pre_exec()
index e4d8cd4a980bc9d952ef3b6b82635adcc2c75572..23d97994723f09ad1c690dce03b8eaf7e365303f 100644 (file)
@@ -2214,6 +2214,36 @@ public:
   virtual const string name() { return "get_cluster_stat"; }
 };
 
+static inline int parse_value_and_bound(
+    const string &input,
+    int &output,
+    const long lower_bound,
+    const long upper_bound,
+    const long default_val)
+{
+  if (!input.empty()) {
+    char *endptr;
+    output = strtol(input.c_str(), &endptr, 10);
+    if (endptr) {
+      if (endptr == input.c_str()) return -EINVAL;
+      while (*endptr && isspace(*endptr)) // ignore white space
+        endptr++;
+      if (*endptr) {
+        return -EINVAL;
+      }
+    }
+    if(output > upper_bound) {
+      output = upper_bound;
+    }
+    if(output < lower_bound) {
+      output = lower_bound;
+    }
+  } else {
+    output = default_val;
+  }
+
+  return 0;
+}
 
 
 #endif /* CEPH_RGW_OP_H */
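
parse_value_and_bound() is the helper behind the reworked RGWListBucket::parse_max_keys() above: an empty value falls back to the default, anything else is parsed as a decimal integer (trailing whitespace tolerated, other trailing characters rejected) and clamped into [lower, upper], which for max-keys means between 0 and the configured rgw_max_listing_results. A standalone illustration of that behaviour (not RGW code, sample bounds and inputs are made up):

#include <cctype>
#include <cerrno>
#include <cstdlib>
#include <iostream>
#include <string>

static int parse_and_bound(const std::string& in, long& out,
                           long lower, long upper, long def)
{
  if (in.empty()) { out = def; return 0; }     // empty -> default
  char* end = nullptr;
  out = std::strtol(in.c_str(), &end, 10);
  if (end == in.c_str()) return -EINVAL;       // no digits at all
  while (*end && std::isspace(static_cast<unsigned char>(*end)))
    ++end;                                     // ignore trailing whitespace
  if (*end) return -EINVAL;                    // reject trailing garbage
  if (out > upper) out = upper;                // clamp to the configured maximum
  if (out < lower) out = lower;                // clamp to the lower bound (0 for max-keys)
  return 0;
}

int main()
{
  long v = 0;
  for (const char* s : {"", "0", "250", "999999", "12abc"}) {
    int r = parse_and_bound(s, v, 0, 1000, 100);
    // v is only meaningful when r == 0
    std::cout << "'" << s << "' -> ret=" << r << " value=" << v << "\n";
  }
}
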
index ce3d1265facd45b8ab2e210511553486e3a7feca..4212e9c8b9852f480193fc6e5c3acb1d14d06b7e 100644 (file)
@@ -1006,3 +1006,26 @@ void RGWQuotaHandler::free_handler(RGWQuotaHandler *handler)
 }
 
 
+void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const CephContext* cct)
+{
+  if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
+    quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects;
+    quota.enabled = true;
+  }
+  if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
+    quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size;
+    quota.enabled = true;
+  }
+}
+
+void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const CephContext* cct)
+{
+  if (cct->_conf->rgw_user_default_quota_max_objects >= 0) {
+    quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects;
+    quota.enabled = true;
+  }
+  if (cct->_conf->rgw_user_default_quota_max_size >= 0) {
+    quota.max_size = cct->_conf->rgw_user_default_quota_max_size;
+    quota.enabled = true;
+  }
+}
index 49ec14871815c1ba43373e87379a349bcd7ac6dd..2f08d4a5d98c74996164c59b130787e1cbdeab7c 100644 (file)
@@ -114,4 +114,8 @@ public:
   static void free_handler(RGWQuotaHandler *handler);
 };
 
+// apply default quotas from configuration
+void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const CephContext* cct);
+void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const CephContext* cct);
+
 #endif
index f123514e470e29f3677cf63d4f394202cabfad7c..ff4ac695954fe38658e0e555b03b67575d767145 100644 (file)
@@ -1388,6 +1388,12 @@ int RGWPeriod::update()
       return -EINVAL;
     }  
     
+    if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
+      ldout(cct,0) << "ERROR: zonegroup " << zg.get_name()
+                   << " has a non existent master zone "<< dendl;
+      return -EINVAL;
+    }
+
     if (zg.is_master_zonegroup()) {
       master_zonegroup = zg.get_id();
       master_zone = zg.master_zone;
@@ -3504,21 +3510,22 @@ int RGWIndexCompletionThread::process()
     ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
 
     RGWRados::BucketShard bs(store);
+    RGWBucketInfo bucket_info;
 
-    int r = bs.init(c->obj.bucket, c->obj);
+    int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
     if (r < 0) {
       ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
       /* not much to do */
       continue;
     }
 
-    r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int { 
-                             librados::ObjectWriteOperation o;
-                             cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
-                             cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
-                                                        c->log_op, c->bilog_op, &c->zones_trace);
-
-                             return bs->index_ctx.operate(bs->bucket_obj, &o);
+    r = store->guard_reshard(&bs, c->obj, bucket_info,
+                            [&](RGWRados::BucketShard *bs) -> int {
+                              librados::ObjectWriteOperation o;
+                              cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+                              cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
+                                                         c->log_op, c->bilog_op, &c->zones_trace);
+                              return bs->index_ctx.operate(bs->bucket_obj, &o);
                              });
     if (r < 0) {
       ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
@@ -4258,6 +4265,9 @@ int RGWRados::init_zg_from_period(bool *initialized)
     // use endpoints from the zonegroup's master zone
     auto master = zg.zones.find(zg.master_zone);
     if (master == zg.zones.end()) {
+      // Check for empty zonegroup which can happen if zone was deleted before removal
+      if (zg.zones.size() == 0)
+        continue;
       // fix missing master zone for a single zone zonegroup
       if (zg.master_zone.empty() && zg.zones.size() == 1) {
        master = zg.zones.begin();
@@ -4920,9 +4930,21 @@ int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::
  */
 int RGWRados::list_buckets_init(RGWAccessHandle *handle)
 {
-  librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
-  *handle = (RGWAccessHandle)state;
-  return 0;
+  try {
+    auto iter = root_pool_ctx.nobjects_begin();
+    librados::NObjectIterator *state = new librados::NObjectIterator(iter);
+    *handle = (RGWAccessHandle)state;
+    return 0;
+  } catch (const std::system_error& e) {
+    int r = -e.code().value();
+    ldout(cct, 10) << "nobjects_begin threw " << e.what()
+       << ", returning " << r << dendl;
+    return r;
+  } catch (const std::exception& e) {
+    ldout(cct, 10) << "nobjects_begin threw " << e.what()
+       << ", returning -5" << dendl;
+    return -EIO;
+  }
 }
 
 /** 
@@ -4945,8 +4967,18 @@ int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *hand
     if (obj.key.name[0] == '_') {
       obj.key.name = obj.key.name.substr(1);
     }
-
-    (*state)++;
+    try {
+      (*state)++;
+    } catch (const std::system_error& e) {
+      int r = -e.code().value();
+      ldout(cct, 10) << "nobjects_begin threw " << e.what()
+         << ", returning " << r << dendl;
+      return r;
+    } catch (const std::exception& e) {
+      ldout(cct, 10) << "nobjects_begin threw " << e.what()
+         << ", returning -5" << dendl;
+      return -EIO;
+    }
   } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
 
   return 0;
@@ -5501,7 +5533,7 @@ int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& durati
   l.set_duration(ut);
   l.set_cookie(owner_id);
   l.set_tag(zone_id);
-  l.set_renew(true);
+  l.set_may_renew(true);
   
   return l.lock_exclusive(&io_ctx, oid);
 }
@@ -5904,9 +5936,9 @@ int RGWRados::create_pool(const rgw_pool& pool)
 
 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
 {
-  librados::IoCtx index_ctx; // context for new bucket
+  librados::IoCtx index_ctx;
 
-  string dir_oid =  dir_oid_prefix;
+  string dir_oid = dir_oid_prefix;
   int r = open_bucket_index_ctx(bucket_info, index_ctx);
   if (r < 0) {
     return r;
@@ -5917,7 +5949,29 @@ int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
   map<int, string> bucket_objs;
   get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
 
-  return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
+  return CLSRGWIssueBucketIndexInit(index_ctx,
+                                   bucket_objs,
+                                   cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+int RGWRados::clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
+{
+  librados::IoCtx index_ctx;
+
+  std::string dir_oid = dir_oid_prefix;
+  int r = open_bucket_index_ctx(bucket_info, index_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  dir_oid.append(bucket_info.bucket.bucket_id);
+
+  std::map<int, std::string> bucket_objs;
+  get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
+
+  return CLSRGWIssueBucketIndexClean(index_ctx,
+                                    bucket_objs,
+                                    cct->_conf->rgw_bucket_index_max_aio)();
 }
 
 void RGWRados::create_bucket_id(string *bucket_id)
@@ -6022,16 +6076,16 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
       /* only remove it if it's a different bucket instance */
       if (info.bucket.bucket_id != bucket.bucket_id) {
         /* remove bucket meta instance */
-        string entry = bucket.get_key();
-        r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
+        r = rgw_bucket_instance_remove_entry(this,
+                                            bucket.get_key(),
+                                            &instance_ver);
         if (r < 0)
           return r;
 
-        map<int, string>::const_iterator biter;
-        for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
-          // Do best effort removal
-          index_ctx.remove(biter->second);
-        }
+       /* remove bucket index objects asynchronously by best effort */
+       (void) CLSRGWIssueBucketIndexClean(index_ctx,
+                                          bucket_objs,
+                                          cct->_conf->rgw_bucket_index_max_aio)();
       }
       /* ret == -ENOENT here */
     }
@@ -6688,19 +6742,24 @@ int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key
   return 0;
 }
 
-int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
+int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
+                               const rgw_obj& obj,
+                               RGWBucketInfo* bucket_info_out)
 {
   bucket = _bucket;
 
   RGWObjectCtx obj_ctx(store);
 
   RGWBucketInfo bucket_info;
-  int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
+  RGWBucketInfo* bucket_info_p =
+    bucket_info_out ? bucket_info_out : &bucket_info;
+  
+  int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
   if (ret < 0) {
     return ret;
   }
 
-  ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
+  ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
   if (ret < 0) {
     ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
     return ret;
@@ -6710,7 +6769,9 @@ int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
   return 0;
 }
 
-int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
+int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
+                               int sid,
+                               RGWBucketInfo* bucket_info_out)
 {
   bucket = _bucket;
   shard_id = sid;
@@ -6718,12 +6779,14 @@ int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
   RGWObjectCtx obj_ctx(store);
 
   RGWBucketInfo bucket_info;
-  int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
+  RGWBucketInfo* bucket_info_p =
+    bucket_info_out ? bucket_info_out : &bucket_info;
+  int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
   if (ret < 0) {
     return ret;
   }
 
-  ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
+  ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, shard_id, &bucket_obj);
   if (ret < 0) {
     ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
     return ret;
@@ -8656,17 +8719,17 @@ int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& ob
   /* if the bucket is not synced we can remove the meta file */
   if (!is_syncing_bucket_meta(bucket)) {
     RGWObjVersionTracker objv_tracker;
-    string entry = bucket.get_key();
-    r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
+    r = rgw_bucket_instance_remove_entry(this, bucket.get_key(), &objv_tracker);
     if (r < 0) {
       return r;
     }
-    /* remove bucket index objects*/
-    map<int, string>::const_iterator biter;
-    for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
-      index_ctx.remove(biter->second);
-    }
+
+   /* remove bucket index objects asynchronously by best effort */
+    (void) CLSRGWIssueBucketIndexClean(index_ctx,
+                                      bucket_objs,
+                                      cct->_conf->rgw_bucket_index_max_aio)();
   }
+
   return 0;
 }
 
@@ -8952,7 +9015,7 @@ int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
   return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
 }
 
-int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
+int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
 {
   librados::IoCtx index_ctx;
   map<int, string> bucket_objs;
@@ -10216,7 +10279,7 @@ int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::functio
     }
     ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
     string new_bucket_id;
-    r = store->block_while_resharding(bs, &new_bucket_id);
+    r = store->block_while_resharding(bs, &new_bucket_id, target->bucket_info);
     if (r == -ERR_BUSY_RESHARDING) {
       continue;
     }
@@ -10268,9 +10331,9 @@ int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_t
     }
   }
 
-  int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int { 
-    return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
-  });
+  int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
+                                  return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
+                                });
 
   if (r < 0) {
     return r;
@@ -10369,9 +10432,9 @@ int RGWRados::Bucket::UpdateIndex::cancel()
   RGWRados *store = target->get_store();
   BucketShard *bs;
 
-  int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int { 
-    return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
-  });
+  int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
+                                return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
+                              });
 
   /*
    * need to update data log anyhow, so that whoever follows needs to update its internal markers
@@ -11176,14 +11239,17 @@ int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjStat
   return ret;
 }
 
-int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
+int RGWRados::guard_reshard(BucketShard *bs,
+                           const rgw_obj& obj_instance,
+                           const RGWBucketInfo& bucket_info,
+                           std::function<int(BucketShard *)> call)
 {
   rgw_obj obj;
   const rgw_obj *pobj = &obj_instance;
   int r;
 
   for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
-    r = bs->init(pobj->bucket, *pobj);
+    r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
     if (r < 0) {
       ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
       return r;
@@ -11194,7 +11260,7 @@ int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::f
     }
     ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
     string new_bucket_id;
-    r = block_while_resharding(bs, &new_bucket_id);
+    r = block_while_resharding(bs, &new_bucket_id, bucket_info);
     if (r == -ERR_BUSY_RESHARDING) {
       continue;
     }
@@ -11216,11 +11282,13 @@ int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::f
   return 0;
 }
 
-int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
+int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
+                                    string *new_bucket_id,
+                                    const RGWBucketInfo& bucket_info)
 {
   std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
 
-  return waiter->block_while_resharding(bs, new_bucket_id);
+  return waiter->block_while_resharding(bs, new_bucket_id, bucket_info);
 }
 
 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
@@ -11246,13 +11314,14 @@ int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjStat
   BucketShard bs(this);
 
   cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
-  r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int { 
-                    librados::ObjectWriteOperation op;
-                    cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                    return cls_rgw_bucket_link_olh(bs->index_ctx, op,
-                                                   bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
-                                                   unmod_since, high_precision_time,
-                                                   get_zone().log_data, zones_trace);
+  r = guard_reshard(&bs, obj_instance, bucket_info,
+                   [&](BucketShard *bs) -> int {
+                     librados::ObjectWriteOperation op;
+                     cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+                     return cls_rgw_bucket_link_olh(bs->index_ctx, op,
+                                                    bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
+                                                    unmod_since, high_precision_time,
+                                                    get_zone().log_data, zones_trace);
                     });
   if (r < 0) {
     ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
@@ -11290,11 +11359,12 @@ int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, con
   BucketShard bs(this);
 
   cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
-  r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int { 
-                    librados::ObjectWriteOperation op;
-                    cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                    return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
-                                                          olh_tag, olh_epoch, get_zone().log_data, zones_trace);
+  r = guard_reshard(&bs, obj_instance, bucket_info,
+                   [&](BucketShard *bs) -> int {
+                     librados::ObjectWriteOperation op;
+                     cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+                     return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
+                                                           olh_tag, olh_epoch, get_zone().log_data, zones_trace);
                     });
   if (r < 0) {
     ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
@@ -11316,7 +11386,8 @@ int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObj
   }
 
   BucketShard bs(this);
-  int ret = bs.init(obj_instance.bucket, obj_instance);
+  int ret =
+    bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
   if (ret < 0) {
     ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
     return ret;
@@ -11326,12 +11397,13 @@ int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObj
 
   cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
 
-  ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int { 
-                      ObjectReadOperation op;
-                      cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                      return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
-                                                 key, ver_marker, olh_tag, log, is_truncated);
-                    });
+  ret = guard_reshard(&bs, obj_instance, bucket_info,
+                     [&](BucketShard *bs) -> int {
+                       ObjectReadOperation op;
+                       cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+                       return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
+                                                  key, ver_marker, olh_tag, log, is_truncated);
+                     });
   if (ret < 0) {
     ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
     return ret;
@@ -11349,7 +11421,8 @@ int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObj
   }
 
   BucketShard bs(this);
-  int ret = bs.init(obj_instance.bucket, obj_instance);
+  int ret =
+    bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
   if (ret < 0) {
     ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
     return ret;
@@ -11359,11 +11432,12 @@ int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObj
 
   cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
 
-  ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int { 
-                      ObjectWriteOperation op;
-                      cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                      cls_rgw_trim_olh_log(op, key, ver, olh_tag);
-                      return pbs->index_ctx.operate(pbs->bucket_obj, &op);
+  ret = guard_reshard(&bs, obj_instance, bucket_info,
+                     [&](BucketShard *pbs) -> int {
+                       ObjectWriteOperation op;
+                       cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+                       cls_rgw_trim_olh_log(op, key, ver, olh_tag);
+                       return pbs->index_ctx.operate(pbs->bucket_obj, &op);
                       });
   if (ret < 0) {
     ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
@@ -11387,10 +11461,11 @@ int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjSta
 
   cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
 
-  int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int { 
-                          ObjectWriteOperation op;
-                          cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
-                          return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
+  int ret = guard_reshard(&bs, obj_instance, bucket_info,
+                         [&](BucketShard *pbs) -> int {
+                           ObjectWriteOperation op;
+                           cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+                           return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
                           });
   if (ret < 0) {
     ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
@@ -12591,9 +12666,19 @@ int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGW
     return -EINVAL;
   }
 
-  iter = io_ctx.nobjects_begin(oc);
-
-  return 0;
+  try {
+    iter = io_ctx.nobjects_begin(oc);
+    return 0;
+  } catch (const std::system_error& e) {
+    r = -e.code().value();
+    ldout(cct, 10) << "nobjects_begin threw " << e.what()
+       << ", returning " << r << dendl;
+    return r;
+  } catch (const std::exception& e) {
+    ldout(cct, 10) << "nobjects_begin threw " << e.what()
+       << ", returning -5" << dendl;
+    return -EIO;
+  }
 }
 
 string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
@@ -12601,7 +12686,8 @@ string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
   return ctx.iter.get_cursor().to_str();
 }
 
-int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
+static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
+                           vector<rgw_bucket_dir_entry>& objs,
                            bool *is_truncated, RGWAccessListFilter *filter)
 {
   librados::IoCtx& io_ctx = ctx.io_ctx;
@@ -12640,6 +12726,24 @@ struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
   }
 };
 
+int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
+                           bool *is_truncated, RGWAccessListFilter *filter)
+{
+  // catch exceptions from NObjectIterator::operator++()
+  try {
+    return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
+  } catch (const std::system_error& e) {
+    int r = -e.code().value();
+    ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
+       << ", returning " << r << dendl;
+    return r;
+  } catch (const std::exception& e) {
+    ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
+       << ", returning -5" << dendl;
+    return -EIO;
+  }
+}
+
 int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
 {
   if (!ctx->initialized) {
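Both pool_iterate_begin() above and the pool_iterate() wrapper follow the same pattern: librados' NObjectIterator can throw std::system_error, so the exception is translated back into the negative-errno convention RGWRados uses everywhere else. A generic sketch of that translation, assuming only the standard library; the helper itself is illustrative and not part of the patch.

    #include <cerrno>
    #include <functional>
    #include <system_error>

    // Illustrative helper: run a callable that may throw librados-style
    // exceptions and map the result onto a negative errno return value.
    static int errno_from_exceptions(const std::function<int()>& fn)
    {
      try {
        return fn();
      } catch (const std::system_error& e) {
        return -e.code().value();   // e.g. -ENOENT, -EPERM
      } catch (const std::exception&) {
        return -EIO;                // unknown failure; same fallback as above
      }
    }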
@@ -12876,7 +12980,7 @@ int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rg
 int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
 {
   BucketShard bs(this);
-  int ret = bs.init(bucket, obj);
+  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
   if (ret < 0) {
     ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
     return ret;
@@ -12908,7 +13012,7 @@ int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
 {
   BucketShard bs(this);
-  int ret = bs.init(bucket, obj);
+  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
   if (ret < 0) {
     ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
     return ret;
@@ -12921,7 +13025,7 @@ int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string&
 {
   rgw_obj obj(bucket, obj_name);
   BucketShard bs(this);
-  int ret = bs.init(bucket, obj);
+  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
   if (ret < 0) {
     ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
     return ret;
@@ -12963,7 +13067,7 @@ int RGWRados::bi_remove(BucketShard& bs)
 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
 {
   BucketShard bs(this);
-  int ret = bs.init(bucket, shard_id);
+  int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
   if (ret < 0) {
     ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
     return ret;
@@ -13015,13 +13119,6 @@ bool RGWRados::process_expire_objects()
   return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
 }
 
-int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
-{
-  bufferlist in;
-  cls_rgw_bucket_init(op);
-  return index_ctx.operate(oid, &op);
-}
-
 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
                                  rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
 {
index 9b4d052c3e77699097182fca4fe3e55548935572..cd462fe20893f865ac511912508ee899a794aae4 100644 (file)
@@ -2229,6 +2229,7 @@ class RGWRados : public AdminSocketHook
   friend class RGWReplicaLogger;
   friend class RGWReshard;
   friend class RGWBucketReshard;
+  friend class RGWBucketReshardLock;
   friend class BucketIndexLockGuard;
   friend class RGWCompleteMultipart;
 
@@ -2611,6 +2612,7 @@ public:
   int create_pool(const rgw_pool& pool);
 
   int init_bucket_index(RGWBucketInfo& bucket_info, int num_shards);
+  int clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards);
   int select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& rule,
                               string *pselected_rule_name, RGWZonePlacementInfo *rule_info);
   int select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info);
@@ -2706,8 +2708,8 @@ public:
     string bucket_obj;
 
     explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {}
-    int init(const rgw_bucket& _bucket, const rgw_obj& obj);
-    int init(const rgw_bucket& _bucket, int sid);
+    int init(const rgw_bucket& _bucket, const rgw_obj& obj, RGWBucketInfo* out);
+    int init(const rgw_bucket& _bucket, int sid, RGWBucketInfo* out);
     int init(const RGWBucketInfo& bucket_info, int sid);
   };
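The added RGWBucketInfo* out-parameter lets a caller that also needs the bucket instance info reuse the lookup init() performs anyway, rather than calling get_bucket_instance_info() a second time; passing nullptr keeps the previous behaviour. A hedged example of the first overload; the surrounding function is hypothetical.

    // Hypothetical caller: open the index shard for an object and keep the
    // bucket instance info that BucketShard::init() already had to load.
    int open_shard_with_info(RGWRados* store, const rgw_bucket& bucket,
                             const rgw_obj& obj)
    {
      RGWRados::BucketShard bs(store);
      RGWBucketInfo info;
      int r = bs.init(bucket, obj, &info);   // &info receives the instance info
      if (r < 0) {
        return r;
      }
      // bs.bucket_obj and info.num_shards are now available (illustrative)
      return 0;
    }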
 
@@ -2747,7 +2749,8 @@ public:
 
     int get_bucket_shard(BucketShard **pbs) {
       if (!bs_initialized) {
-        int r = bs.init(bucket_info.bucket, obj);
+        int r =
+         bs.init(bucket_info.bucket, obj, nullptr /* no RGWBucketInfo */);
         if (r < 0) {
           return r;
         }
@@ -2943,7 +2946,8 @@ public:
       rgw_zone_set *zones_trace{nullptr};
 
       int init_bs() {
-        int r = bs.init(target->get_bucket(), obj);
+        int r =
+         bs.init(target->get_bucket(), obj, nullptr /* no RGWBucketInfo */);
         if (r < 0) {
           return r;
         }
@@ -3350,8 +3354,13 @@ public:
   int obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op);
   int obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op);
 
-  int guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call);
-  int block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id);
+  int guard_reshard(BucketShard *bs,
+                   const rgw_obj& obj_instance,
+                   const RGWBucketInfo& bucket_info,
+                   std::function<int(BucketShard *)> call);
+  int block_while_resharding(RGWRados::BucketShard *bs,
+                            string *new_bucket_id,
+                            const RGWBucketInfo& bucket_info);
 
   void bucket_index_guard_olh_op(RGWObjState& olh_state, librados::ObjectOperation& op);
   int olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag);
@@ -3511,7 +3520,6 @@ public:
   int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv,
                             map<string, bufferlist> *pattrs, bool create_entry_point);
 
-  int cls_rgw_init_index(librados::IoCtx& io_ctx, librados::ObjectWriteOperation& op, string& oid);
   int cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
   int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag, int64_t pool, uint64_t epoch,
                           rgw_bucket_dir_entry& ent, RGWObjCategory category, list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
@@ -3623,7 +3631,7 @@ public:
                          map<RGWObjCategory, RGWStorageStats> *existing_stats,
                          map<RGWObjCategory, RGWStorageStats> *calculated_stats);
   int bucket_rebuild_index(RGWBucketInfo& bucket_info);
-  int bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
+  int bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
   int remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list);
   int move_rados_obj(librados::IoCtx& src_ioctx,
                     const string& src_oid, const string& src_locator,
index 7b311116338d1f0bdf6cdc5a477922c40b2a1bbe..a7c041f68ba3d491c3cc66be29eb13fd6123de9f 100644 (file)
@@ -1,6 +1,8 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 
+#include <limits>
+
 #include "rgw_rados.h"
 #include "rgw_bucket.h"
 #include "rgw_reshard.h"
@@ -23,6 +25,7 @@ using namespace std;
 #define RESHARD_SHARD_WINDOW 64
 #define RESHARD_MAX_AIO 128
 
+
 class BucketReshardShard {
   RGWRados *store;
   const RGWBucketInfo& bucket_info;
@@ -66,10 +69,12 @@ class BucketReshardShard {
 public:
   BucketReshardShard(RGWRados *_store, const RGWBucketInfo& _bucket_info,
                      int _num_shard,
-                     deque<librados::AioCompletion *>& _completions) : store(_store), bucket_info(_bucket_info), bs(store),
-                                                                       aio_completions(_completions) {
+                     deque<librados::AioCompletion *>& _completions) :
+    store(_store), bucket_info(_bucket_info), bs(store),
+    aio_completions(_completions)
+  {
     num_shard = (bucket_info.num_shards > 0 ? _num_shard : -1);
-    bs.init(bucket_info.bucket, num_shard);
+    bs.init(bucket_info.bucket, num_shard, nullptr /* no RGWBucketInfo */);
   }
 
   int get_num_shard() {
@@ -92,8 +97,10 @@ public:
         return ret;
       }
     }
+
     return 0;
   }
+
   int flush() {
     if (entries.size() == 0) {
       return 0;
@@ -130,7 +137,8 @@ public:
     }
     return ret;
   }
-};
+}; // class BucketReshardShard
+
 
 class BucketReshardManager {
   RGWRados *store;
@@ -140,8 +148,12 @@ class BucketReshardManager {
   vector<BucketReshardShard *> target_shards;
 
 public:
-  BucketReshardManager(RGWRados *_store, const RGWBucketInfo& _target_bucket_info, int _num_target_shards) : store(_store), target_bucket_info(_target_bucket_info),
-                                                                                                       num_target_shards(_num_target_shards) {
+  BucketReshardManager(RGWRados *_store,
+                      const RGWBucketInfo& _target_bucket_info,
+                      int _num_target_shards) :
+    store(_store), target_bucket_info(_target_bucket_info),
+    num_target_shards(_num_target_shards)
+  {
     target_shards.resize(num_target_shards);
     for (int i = 0; i < num_target_shards; ++i) {
       target_shards[i] = new BucketReshardShard(store, target_bucket_info, i, completions);
@@ -152,7 +164,8 @@ public:
     for (auto& shard : target_shards) {
       int ret = shard->wait_all_aio();
       if (ret < 0) {
-        ldout(store->ctx(), 20) << __func__ << ": shard->wait_all_aio() returned ret=" << ret << dendl;
+        ldout(store->ctx(), 20) << __func__ <<
+         ": shard->wait_all_aio() returned ret=" << ret << dendl;
       }
     }
   }
@@ -160,11 +173,14 @@ public:
   int add_entry(int shard_index,
                 rgw_cls_bi_entry& entry, bool account, uint8_t category,
                 const rgw_bucket_category_stats& entry_stats) {
-    int ret = target_shards[shard_index]->add_entry(entry, account, category, entry_stats);
+    int ret = target_shards[shard_index]->add_entry(entry, account, category,
+                                                   entry_stats);
     if (ret < 0) {
-      derr << "ERROR: target_shards.add_entry(" << entry.idx << ") returned error: " << cpp_strerror(-ret) << dendl;
+      derr << "ERROR: target_shards.add_entry(" << entry.idx <<
+       ") returned error: " << cpp_strerror(-ret) << dendl;
       return ret;
     }
+
     return 0;
   }
 
@@ -188,43 +204,22 @@ public:
     target_shards.clear();
     return ret;
   }
-};
-
-RGWBucketReshard::RGWBucketReshard(RGWRados *_store, const RGWBucketInfo& _bucket_info, const map<string, bufferlist>& _bucket_attrs) :
-                                                     store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
-                                                     reshard_lock(reshard_lock_name) {
-  const rgw_bucket& b = bucket_info.bucket;
-  reshard_oid = b.tenant + (b.tenant.empty() ? "" : ":") + b.name + ":" + b.bucket_id;
-
-  utime_t lock_duration(store->ctx()->_conf->rgw_reshard_bucket_lock_duration, 0);
-#define COOKIE_LEN 16
-  char cookie_buf[COOKIE_LEN + 1];
-  gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
-  cookie_buf[COOKIE_LEN] = '\0';
-
-  reshard_lock.set_cookie(cookie_buf);
-  reshard_lock.set_duration(lock_duration);
-}
-
-int RGWBucketReshard::lock_bucket()
-{
-  int ret = reshard_lock.lock_exclusive(&store->reshard_pool_ctx, reshard_oid);
-  if (ret < 0) {
-    ldout(store->ctx(), 0) << "RGWReshard::add failed to acquire lock on " << reshard_oid << " ret=" << ret << dendl;
-    return ret;
-  }
-  return 0;
-}
-
-void RGWBucketReshard::unlock_bucket()
-{
-  int ret = reshard_lock.unlock(&store->reshard_pool_ctx, reshard_oid);
-  if (ret < 0) {
-    ldout(store->ctx(), 0) << "WARNING: RGWReshard::add failed to drop lock on " << reshard_oid << " ret=" << ret << dendl;
-  }
-}
-
-int RGWBucketReshard::set_resharding_status(const string& new_instance_id, int32_t num_shards, cls_rgw_reshard_status status)
+}; // class BucketReshardManager
+
+RGWBucketReshard::RGWBucketReshard(RGWRados *_store,
+                                  const RGWBucketInfo& _bucket_info,
+                                  const map<string, bufferlist>& _bucket_attrs,
+                                  RGWBucketReshardLock* _outer_reshard_lock) :
+  store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
+  reshard_lock(store, bucket_info, true),
+  outer_reshard_lock(_outer_reshard_lock)
+{ }
+
+int RGWBucketReshard::set_resharding_status(RGWRados* store,
+                                           const RGWBucketInfo& bucket_info,
+                                           const string& new_instance_id,
+                                           int32_t num_shards,
+                                           cls_rgw_reshard_status status)
 {
   if (new_instance_id.empty()) {
     ldout(store->ctx(), 0) << __func__ << " missing new bucket instance id" << dendl;
@@ -243,16 +238,48 @@ int RGWBucketReshard::set_resharding_status(const string& new_instance_id, int32
   return 0;
 }
 
-int RGWBucketReshard::clear_resharding()
+// assumes the reshard lock is held
+int RGWBucketReshard::clear_resharding(RGWRados* store,
+                                      const RGWBucketInfo& bucket_info)
 {
-  cls_rgw_bucket_instance_entry instance_entry;
+  int ret = clear_index_shard_reshard_status(store, bucket_info);
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "RGWBucketReshard::" << __func__ <<
+      " ERROR: error clearing reshard status from index shard " <<
+      cpp_strerror(-ret) << dendl;
+    return ret;
+  }
 
-  int ret = store->bucket_set_reshard(bucket_info, instance_entry);
+  cls_rgw_bucket_instance_entry instance_entry;
+  ret = store->bucket_set_reshard(bucket_info, instance_entry);
   if (ret < 0) {
-    ldout(store->ctx(), 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: "
-                 << cpp_strerror(-ret) << dendl;
+    ldout(store->ctx(), 0) << "RGWReshard::" << __func__ <<
+      " ERROR: error setting bucket resharding flag on bucket index: " <<
+      cpp_strerror(-ret) << dendl;
     return ret;
   }
+
+  return 0;
+}
+
+int RGWBucketReshard::clear_index_shard_reshard_status(RGWRados* store,
+                                                      const RGWBucketInfo& bucket_info)
+{
+  uint32_t num_shards = bucket_info.num_shards;
+
+  if (num_shards < std::numeric_limits<uint32_t>::max()) {
+    int ret = set_resharding_status(store, bucket_info,
+                                   bucket_info.bucket.bucket_id,
+                                   (num_shards < 1 ? 1 : num_shards),
+                                   CLS_RGW_RESHARD_NONE);
+    if (ret < 0) {
+      ldout(store->ctx(), 0) << "RGWBucketReshard::" << __func__ <<
+       " ERROR: error clearing reshard status from index shard " <<
+       cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+  }
+
   return 0;
 }
 
@@ -276,13 +303,13 @@ static int create_new_bucket_instance(RGWRados *store,
   int ret = store->init_bucket_index(new_bucket_info, new_bucket_info.num_shards);
   if (ret < 0) {
     cerr << "ERROR: failed to init new bucket indexes: " << cpp_strerror(-ret) << std::endl;
-    return -ret;
+    return ret;
   }
 
   ret = store->put_bucket_instance_info(new_bucket_info, true, real_time(), &attrs);
   if (ret < 0) {
     cerr << "ERROR: failed to store new bucket instance info: " << cpp_strerror(-ret) << std::endl;
-    return -ret;
+    return ret;
   }
 
   return 0;
@@ -291,20 +318,21 @@ static int create_new_bucket_instance(RGWRados *store,
 int RGWBucketReshard::create_new_bucket_instance(int new_num_shards,
                                                  RGWBucketInfo& new_bucket_info)
 {
-  return ::create_new_bucket_instance(store, new_num_shards, bucket_info, bucket_attrs, new_bucket_info);
+  return ::create_new_bucket_instance(store, new_num_shards,
+                                     bucket_info, bucket_attrs, new_bucket_info);
 }
 
 int RGWBucketReshard::cancel()
 {
-  int ret = lock_bucket();
+  int ret = reshard_lock.lock();
   if (ret < 0) {
     return ret;
   }
 
   ret = clear_resharding();
 
-  unlock_bucket();
-  return 0;
+  reshard_lock.unlock();
+  return ret;
 }
 
 class BucketInfoReshardUpdate
@@ -326,16 +354,28 @@ class BucketInfoReshardUpdate
   }
 
 public:
-  BucketInfoReshardUpdate(RGWRados *_store, RGWBucketInfo& _bucket_info,
-                          map<string, bufferlist>& _bucket_attrs, const string& new_bucket_id) : store(_store), 
-                                                                                                 bucket_info(_bucket_info),
-                                                                                                 bucket_attrs(_bucket_attrs) {
+  BucketInfoReshardUpdate(RGWRados *_store,
+                         RGWBucketInfo& _bucket_info,
+                          map<string, bufferlist>& _bucket_attrs,
+                         const string& new_bucket_id) :
+    store(_store),
+    bucket_info(_bucket_info),
+    bucket_attrs(_bucket_attrs)
+  {
     bucket_info.new_bucket_instance_id = new_bucket_id;
   }
+
   ~BucketInfoReshardUpdate() {
     if (in_progress) {
+      // resharding must not have ended correctly, clean up
+      int ret =
+       RGWBucketReshard::clear_index_shard_reshard_status(store, bucket_info);
+      if (ret < 0) {
+       lderr(store->ctx()) << "Error: " << __func__ <<
+         " clear_index_shard_status returned " << ret << dendl;
+      }
       bucket_info.new_bucket_instance_id.clear();
-      set_status(CLS_RGW_RESHARD_NONE);
+      set_status(CLS_RGW_RESHARD_NONE); // clears new_bucket_instance as well
     }
   }
 
@@ -358,35 +398,109 @@ public:
   }
 };
 
-int RGWBucketReshard::do_reshard(
-                  int num_shards,
-                  RGWBucketInfo& new_bucket_info,
-                  int max_entries,
-                   bool verbose,
-                   ostream *out,
-                  Formatter *formatter)
+
+RGWBucketReshardLock::RGWBucketReshardLock(RGWRados* _store,
+                                          const std::string& reshard_lock_oid,
+                                          bool _ephemeral) :
+  store(_store),
+  lock_oid(reshard_lock_oid),
+  ephemeral(_ephemeral),
+  internal_lock(reshard_lock_name)
+{
+  const int lock_dur_secs = store->ctx()->_conf->rgw_reshard_bucket_lock_duration;
+  duration = std::chrono::seconds(lock_dur_secs);
+
+#define COOKIE_LEN 16
+  char cookie_buf[COOKIE_LEN + 1];
+  gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
+  cookie_buf[COOKIE_LEN] = '\0';
+
+  internal_lock.set_cookie(cookie_buf);
+  internal_lock.set_duration(duration);
+}
+
+int RGWBucketReshardLock::lock() {
+  internal_lock.set_must_renew(false);
+  int ret;
+  if (ephemeral) {
+    ret = internal_lock.lock_exclusive_ephemeral(&store->reshard_pool_ctx,
+                                                lock_oid);
+  } else {
+    ret = internal_lock.lock_exclusive(&store->reshard_pool_ctx, lock_oid);
+  }
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "RGWReshardLock::" << __func__ <<
+      " failed to acquire lock on " << lock_oid << " ret=" << ret << dendl;
+    return ret;
+  }
+  reset_time(Clock::now());
+
+  return 0;
+}
+
+void RGWBucketReshardLock::unlock() {
+  int ret = internal_lock.unlock(&store->reshard_pool_ctx, lock_oid);
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ <<
+      " failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
+  }
+}
+
+int RGWBucketReshardLock::renew(const Clock::time_point& now) {
+  internal_lock.set_must_renew(true);
+  int ret;
+  if (ephemeral) {
+    ret = internal_lock.lock_exclusive_ephemeral(&store->reshard_pool_ctx,
+                                                lock_oid);
+  } else {
+    ret = internal_lock.lock_exclusive(&store->reshard_pool_ctx, lock_oid);
+  }
+  if (ret < 0) { /* expired or already locked by another processor */
+    ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
+      lock_oid << " with " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  internal_lock.set_must_renew(false);
+
+  reset_time(now);
+  ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
+    lock_oid << dendl;
+
+  return 0;
+}
+
+
+int RGWBucketReshard::do_reshard(int num_shards,
+                                RGWBucketInfo& new_bucket_info,
+                                int max_entries,
+                                bool verbose,
+                                ostream *out,
+                                Formatter *formatter)
 {
   rgw_bucket& bucket = bucket_info.bucket;
 
   int ret = 0;
 
   if (out) {
-    (*out) << "*** NOTICE: operation will not remove old bucket index objects ***" << std::endl;
-    (*out) << "***         these will need to be removed manually             ***" << std::endl;
     (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl;
     (*out) << "bucket name: " << bucket_info.bucket.name << std::endl;
-    (*out) << "old bucket instance id: " << bucket_info.bucket.bucket_id << std::endl;
-    (*out) << "new bucket instance id: " << new_bucket_info.bucket.bucket_id << std::endl;
+    (*out) << "old bucket instance id: " << bucket_info.bucket.bucket_id <<
+      std::endl;
+    (*out) << "new bucket instance id: " << new_bucket_info.bucket.bucket_id <<
+      std::endl;
   }
 
-  /* update bucket info  -- in progress*/
+  /* update bucket info -- in progress*/
   list<rgw_cls_bi_entry> entries;
 
   if (max_entries < 0) {
-    ldout(store->ctx(), 0) << __func__ << ": can't reshard, negative max_entries" << dendl;
+    ldout(store->ctx(), 0) << __func__ <<
+      ": can't reshard, negative max_entries" << dendl;
     return -EINVAL;
   }
 
+  // NB: destructor cleans up sharding state if reshard does not
+  // complete successfully
   BucketInfoReshardUpdate bucket_info_updater(store, bucket_info, bucket_attrs, new_bucket_info.bucket.bucket_id);
 
   ret = bucket_info_updater.start();
@@ -411,7 +525,8 @@ int RGWBucketReshard::do_reshard(
     cout << "total entries:";
   }
 
-  int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
+  const int num_source_shards =
+    (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
   string marker;
   for (int i = 0; i < num_source_shards; ++i) {
     bool is_truncated = true;
@@ -421,11 +536,10 @@ int RGWBucketReshard::do_reshard(
       ret = store->bi_list(bucket, i, string(), marker, max_entries, &entries, &is_truncated);
       if (ret < 0 && ret != -ENOENT) {
        derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl;
-       return -ret;
+       return ret;
       }
 
-      list<rgw_cls_bi_entry>::iterator iter;
-      for (iter = entries.begin(); iter != entries.end(); ++iter) {
+      for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
        rgw_cls_bi_entry& entry = *iter;
        if (verbose) {
          formatter->open_object_section("entry");
@@ -453,22 +567,41 @@ int RGWBucketReshard::do_reshard(
 
        int shard_index = (target_shard_id > 0 ? target_shard_id : 0);
 
-       ret = target_shards_mgr.add_entry(shard_index, entry, account, category, stats);
+       ret = target_shards_mgr.add_entry(shard_index, entry, account,
+                                         category, stats);
        if (ret < 0) {
          return ret;
        }
+
+       Clock::time_point now = Clock::now();
+       if (reshard_lock.should_renew(now)) {
+         // assume outer locks have timespans at least the size of ours, so
+         // can call inside conditional
+         if (outer_reshard_lock) {
+           ret = outer_reshard_lock->renew(now);
+           if (ret < 0) {
+             return ret;
+           }
+         }
+         ret = reshard_lock.renew(now);
+         if (ret < 0) {
+           lderr(store->ctx()) << "Error renewing bucket lock: " << ret << dendl;
+           return ret;
+         }
+       }
+
        if (verbose) {
          formatter->close_section();
          if (out) {
            formatter->flush(*out);
-           formatter->flush(*out);
          }
        } else if (out && !(total_entries % 1000)) {
          (*out) << " " << total_entries;
        }
-      }
+      } // entries loop
     }
   }
+
   if (verbose) {
     formatter->close_section();
     if (out) {
@@ -481,13 +614,13 @@ int RGWBucketReshard::do_reshard(
   ret = target_shards_mgr.finish();
   if (ret < 0) {
     lderr(store->ctx()) << "ERROR: failed to reshard" << dendl;
-    return EIO;
+    return -EIO;
   }
 
   ret = rgw_link_bucket(store, new_bucket_info.owner, new_bucket_info.bucket, bucket_info.creation_time);
   if (ret < 0) {
     lderr(store->ctx()) << "failed to link new bucket instance (bucket_id=" << new_bucket_info.bucket.bucket_id << ": " << cpp_strerror(-ret) << ")" << dendl;
-    return -ret;
+    return ret;
   }
 
   ret = bucket_info_updater.complete();
@@ -495,8 +628,10 @@ int RGWBucketReshard::do_reshard(
     ldout(store->ctx(), 0) << __func__ << ": failed to update bucket info ret=" << ret << dendl;
     /* don't error out, reshard process succeeded */
   }
+
   return 0;
-}
+  // NB: some error clean-up is done by ~BucketInfoReshardUpdate
+} // RGWBucketReshard::do_reshard
 
 int RGWBucketReshard::get_status(list<cls_rgw_bucket_instance_entry> *status)
 {
@@ -523,11 +658,14 @@ int RGWBucketReshard::get_status(list<cls_rgw_bucket_instance_entry> *status)
   return 0;
 }
 
-int RGWBucketReshard::execute(int num_shards, int max_op_entries,
-                              bool verbose, ostream *out, Formatter *formatter, RGWReshard* reshard_log)
 
+int RGWBucketReshard::execute(int num_shards, int max_op_entries,
+                              bool verbose, ostream *out, Formatter *formatter,
+                             RGWReshard* reshard_log)
 {
-  int ret = lock_bucket();
+  Clock::time_point now;
+
+  int ret = reshard_lock.lock();
   if (ret < 0) {
     return ret;
   }
@@ -535,21 +673,23 @@ int RGWBucketReshard::execute(int num_shards, int max_op_entries,
   RGWBucketInfo new_bucket_info;
   ret = create_new_bucket_instance(num_shards, new_bucket_info);
   if (ret < 0) {
-    unlock_bucket();
-    return ret;
+    // shard state is uncertain, but this will attempt to remove them anyway
+    goto error_out;
   }
 
   if (reshard_log) {
     ret = reshard_log->update(bucket_info, new_bucket_info);
     if (ret < 0) {
-      unlock_bucket();
-      return ret;
+      goto error_out;
     }
   }
 
-  ret = set_resharding_status(new_bucket_info.bucket.bucket_id, num_shards, CLS_RGW_RESHARD_IN_PROGRESS);
+  // set resharding status of current bucket_info & shards with
+  // information about planned resharding
+  ret = set_resharding_status(new_bucket_info.bucket.bucket_id,
+                             num_shards, CLS_RGW_RESHARD_IN_PROGRESS);
   if (ret < 0) {
-    unlock_bucket();
+    reshard_lock.unlock();
     return ret;
   }
 
@@ -557,32 +697,77 @@ int RGWBucketReshard::execute(int num_shards, int max_op_entries,
                   new_bucket_info,
                   max_op_entries,
                    verbose, out, formatter);
-
   if (ret < 0) {
-    unlock_bucket();
-    return ret;
+    goto error_out;
   }
 
-  ret = set_resharding_status(new_bucket_info.bucket.bucket_id, num_shards, CLS_RGW_RESHARD_DONE);
+  // at this point we've done the main work; we'll make a best-effort
+  // to clean up but will not indicate any errors encountered
+
+  reshard_lock.unlock();
+
+  // resharding successful, so remove old bucket index shards; use
+  // best effort and don't report out an error; the lock isn't needed
+  // at this point since all we're doing is a best-effort removal of old
+  // shard objects
+  ret = store->clean_bucket_index(bucket_info, bucket_info.num_shards);
   if (ret < 0) {
-    unlock_bucket();
-    return ret;
+    lderr(store->ctx()) << "Error: " << __func__ <<
+      " failed to clean up old shards; " <<
+      "RGWRados::clean_bucket_index returned " << ret << dendl;
   }
 
-  unlock_bucket();
+  ret = rgw_bucket_instance_remove_entry(store,
+                                        bucket_info.bucket.get_key(),
+                                        nullptr);
+  if (ret < 0) {
+    lderr(store->ctx()) << "Error: " << __func__ <<
+      " failed to clean old bucket info object \"" <<
+      bucket_info.bucket.get_key() <<
+      "\" created after successful resharding with error " << ret << dendl;
+  }
 
   return 0;
-}
+
+error_out:
+
+  reshard_lock.unlock();
+
+  // since the real problem is the issue that led to this error code
+  // path, we won't touch ret and instead use another variable to
+  // temporarily hold error codes
+  int ret2 = store->clean_bucket_index(new_bucket_info,
+                                      new_bucket_info.num_shards);
+  if (ret2 < 0) {
+    lderr(store->ctx()) << "Error: " << __func__ <<
+      " failed to clean up shards from failed incomplete resharding; " <<
+      "RGWRados::clean_bucket_index returned " << ret2 << dendl;
+  }
+
+  ret2 = rgw_bucket_instance_remove_entry(store,
+                                         new_bucket_info.bucket.get_key(),
+                                         nullptr);
+  if (ret2 < 0) {
+    lderr(store->ctx()) << "Error: " << __func__ <<
+      " failed to clean bucket info object \"" <<
+      new_bucket_info.bucket.get_key() <<
+      "\" created during incomplete resharding with error " << ret2 << dendl;
+  }
+
+  return ret;
+} // execute
 
 
 RGWReshard::RGWReshard(RGWRados* _store, bool _verbose, ostream *_out,
-                       Formatter *_formatter) : store(_store), instance_lock(bucket_instance_lock_name),
-                                                verbose(_verbose), out(_out), formatter(_formatter)
+                       Formatter *_formatter) :
+  store(_store), instance_lock(bucket_instance_lock_name),
+  verbose(_verbose), out(_out), formatter(_formatter)
 {
   num_logshards = store->ctx()->_conf->rgw_reshard_num_logs;
 }
 
-string RGWReshard::get_logshard_key(const string& tenant, const string& bucket_name)
+string RGWReshard::get_logshard_key(const string& tenant,
+                                   const string& bucket_name)
 {
   return tenant + ":" + bucket_name;
 }
@@ -732,12 +917,14 @@ int RGWReshardWait::do_wait()
   return 0;
 }
 
-int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
+int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs,
+                                          string *new_bucket_id,
+                                          const RGWBucketInfo& bucket_info)
 {
   int ret = 0;
   cls_rgw_bucket_instance_entry entry;
 
-  for (int i=0; i < num_retries;i++) {
+  for (int i=0; i < num_retries; i++) {
     ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry);
     if (ret < 0) {
       ldout(store->ctx(), 0) << __func__ << " ERROR: failed to get bucket resharding :"  <<
@@ -749,12 +936,48 @@ int RGWReshardWait::block_while_resharding(RGWRados::BucketShard *bs, string *ne
       return 0;
     }
     ldout(store->ctx(), 20) << "NOTICE: reshard still in progress; " << (i < num_retries - 1 ? "retrying" : "too many retries") << dendl;
-    /* needed to unlock as clear resharding uses the same lock */
 
     if (i == num_retries - 1) {
       break;
     }
 
+    // If bucket is erroneously marked as resharding (e.g., crash or
+    // other error) then fix it. If we can take the bucket reshard
+    // lock then it means no other resharding should be taking place,
+    // and we're free to clear the flags.
+    {
+      // since we expect to do this rarely, we'll do our work in a
+      // block and erase our work after each try
+
+      RGWObjectCtx obj_ctx(bs->store);
+      const rgw_bucket& b = bs->bucket;
+      std::string bucket_id = b.get_key();
+      RGWBucketReshardLock reshard_lock(bs->store, bucket_info, true);
+      ret = reshard_lock.lock();
+      if (ret < 0) {
+       ldout(store->ctx(), 20) << __func__ <<
+         " INFO: failed to take reshard lock for bucket " <<
+         bucket_id << "; expected if resharding underway" << dendl;
+      } else {
+       ldout(store->ctx(), 10) << __func__ <<
+         " INFO: was able to take reshard lock for bucket " <<
+         bucket_id << dendl;
+       ret = RGWBucketReshard::clear_resharding(bs->store, bucket_info);
+       if (ret < 0) {
+         reshard_lock.unlock();
+         ldout(store->ctx(), 0) << __func__ <<
+           " ERROR: failed to clear resharding flags for bucket " <<
+           bucket_id << dendl;
+       } else {
+         reshard_lock.unlock();
+         ldout(store->ctx(), 5) << __func__ <<
+           " INFO: apparently successfully cleared resharding flags for "
+           "bucket " << bucket_id << dendl;
+         continue; // if we apparently succeed immediately test again
+       } // if clear resharding succeeded
+      } // if taking of lock succeeded
+    } // block to encapsulate recovery from incomplete reshard
+
     ret = do_wait();
     if (ret < 0) {
       ldout(store->ctx(), 0) << __func__ << " ERROR: bucket is still resharding, please retry" << dendl;
@@ -771,92 +994,86 @@ int RGWReshard::process_single_logshard(int logshard_num)
   bool truncated = true;
 
   CephContext *cct = store->ctx();
-  int max_entries = 1000;
-  int max_secs = 60;
-
-  rados::cls::lock::Lock l(reshard_lock_name);
-
-  utime_t time(max_secs, 0);
-  l.set_duration(time);
-
-  char cookie_buf[COOKIE_LEN + 1];
-  gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
-  cookie_buf[COOKIE_LEN] = '\0';
-
-  l.set_cookie(cookie_buf);
+  constexpr uint32_t max_entries = 1000;
 
   string logshard_oid;
   get_logshard_oid(logshard_num, &logshard_oid);
 
-  int ret = l.lock_exclusive(&store->reshard_pool_ctx, logshard_oid);
+  RGWBucketReshardLock logshard_lock(store, logshard_oid, false);
+
+  int ret = logshard_lock.lock();
   if (ret == -EBUSY) { /* already locked by another processor */
-    ldout(store->ctx(), 5) << __func__ << "(): failed to acquire lock on " << logshard_oid << dendl;
+    ldout(store->ctx(), 5) << __func__ << "(): failed to acquire lock on " <<
+      logshard_oid << dendl;
     return ret;
   }
 
-  utime_t lock_start_time = ceph_clock_now();
-
   do {
     std::list<cls_rgw_reshard_entry> entries;
     ret = list(logshard_num, marker, max_entries, entries, &truncated);
     if (ret < 0) {
-      ldout(cct, 10) << "cannot list all reshards in logshard oid=" << logshard_oid << dendl;
+      ldout(cct, 10) << "cannot list all reshards in logshard oid=" <<
+       logshard_oid << dendl;
       continue;
     }
 
-    for(auto& entry: entries) {
+    for(auto& entry: entries) { // logshard entries
       if(entry.new_instance_id.empty()) {
 
-       ldout(store->ctx(), 20) << __func__ << " resharding " << entry.bucket_name  << dendl;
+       ldout(store->ctx(), 20) << __func__ << " resharding " <<
+         entry.bucket_name  << dendl;
 
        RGWObjectCtx obj_ctx(store);
        rgw_bucket bucket;
        RGWBucketInfo bucket_info;
        map<string, bufferlist> attrs;
 
-       ret = store->get_bucket_info(obj_ctx, entry.tenant, entry.bucket_name, bucket_info, nullptr,
-                                   &attrs);
+       ret = store->get_bucket_info(obj_ctx, entry.tenant, entry.bucket_name,
+                                    bucket_info, nullptr, &attrs);
        if (ret < 0) {
-         ldout(cct, 0) <<  __func__ << ": Error in get_bucket_info: " << cpp_strerror(-ret) << dendl;
+         ldout(cct, 0) <<  __func__ << ": Error in get_bucket_info: " <<
+           cpp_strerror(-ret) << dendl;
          return -ret;
        }
 
-       RGWBucketReshard br(store, bucket_info, attrs);
+       RGWBucketReshard br(store, bucket_info, attrs, nullptr);
 
        Formatter* formatter = new JSONFormatter(false);
        auto formatter_ptr = std::unique_ptr<Formatter>(formatter);
-       ret = br.execute(entry.new_num_shards, max_entries, true,nullptr, formatter, this);
+       ret = br.execute(entry.new_num_shards, max_entries, true, nullptr,
+                        formatter, this);
        if (ret < 0) {
-         ldout (store->ctx(), 0) <<  __func__ << "ERROR in reshard_bucket " << entry.bucket_name << ":" <<
+         ldout (store->ctx(), 0) <<  __func__ <<
+           "ERROR in reshard_bucket " << entry.bucket_name << ":" <<
            cpp_strerror(-ret)<< dendl;
          return ret;
        }
 
-       ldout (store->ctx(), 20) <<  " removing entry" << entry.bucket_name<< dendl;
+       ldout (store->ctx(), 20) <<  " removing entry" << entry.bucket_name <<
+         dendl;
 
        ret = remove(entry);
        if (ret < 0) {
-         ldout(cct, 0)<< __func__ << ":Error removing bucket " << entry.bucket_name << " for resharding queue: "
-                      << cpp_strerror(-ret) << dendl;
+         ldout(cct, 0)<< __func__ << ":Error removing bucket " <<
+           entry.bucket_name << " for resharding queue: " <<
+           cpp_strerror(-ret) << dendl;
          return ret;
        }
       }
-      utime_t now = ceph_clock_now();
-
-      if (now > lock_start_time + max_secs / 2) { /* do you need to renew lock? */
-        l.set_renew(true);
-        ret = l.lock_exclusive(&store->reshard_pool_ctx, logshard_oid);
-        if (ret == -EBUSY) { /* already locked by another processor */
-          ldout(store->ctx(), 5) << __func__ << "(): failed to acquire lock on " << logshard_oid << dendl;
-          return ret;
-        }
-        lock_start_time = now;
+
+      Clock::time_point now = Clock::now();
+      if (logshard_lock.should_renew(now)) {
+       ret = logshard_lock.renew(now);
+       if (ret < 0) {
+         return ret;
+       }
       }
+
       entry.get_key(&marker);
     }
   } while (truncated);
 
-  l.unlock(&store->reshard_pool_ctx, logshard_oid);
+  logshard_lock.unlock();
   return 0;
 }
 
index 6fe43129906f0fdd642d37594c3895a693a7d076..be47c86b2c93546122bb6de73b1424da58cc9708 100644 (file)
@@ -5,31 +5,72 @@
 #define RGW_RESHARD_H
 
 #include <vector>
+#include <functional>
+
 #include "include/rados/librados.hpp"
+#include "common/ceph_time.h"
 #include "cls/rgw/cls_rgw_types.h"
 #include "cls/lock/cls_lock_client.h"
 #include "rgw_bucket.h"
 
+
 class CephContext;
 class RGWRados;
 
+class RGWBucketReshardLock {
+  using Clock = ceph::coarse_mono_clock;
+
+  RGWRados* store;
+  const std::string lock_oid;
+  const bool ephemeral;
+  rados::cls::lock::Lock internal_lock;
+  std::chrono::seconds duration;
+
+  Clock::time_point start_time;
+  Clock::time_point renew_thresh;
+
+  void reset_time(const Clock::time_point& now) {
+    start_time = now;
+    renew_thresh = start_time + duration / 2;
+  }
+
+public:
+  RGWBucketReshardLock(RGWRados* _store,
+                      const std::string& reshard_lock_oid,
+                      bool _ephemeral);
+  RGWBucketReshardLock(RGWRados* _store,
+                      const RGWBucketInfo& bucket_info,
+                      bool _ephemeral) :
+    RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral)
+  {}
+
+  int lock();
+  void unlock();
+  int renew(const Clock::time_point&);
+
+  bool should_renew(const Clock::time_point& now) const {
+    return now >= renew_thresh;
+  }
+}; // class RGWBucketReshardLock
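Taken together, lock(), should_renew(), renew() and unlock() give long-running reshard work a lease that must be refreshed before rgw_reshard_bucket_lock_duration elapses. A minimal sketch of that lifecycle, assuming a hypothetical work loop; only the lock API comes from this header.

    // Sketch of the intended lock lifecycle; more_entries() and
    // process_one_entry() are hypothetical placeholders for real reshard work.
    int run_under_reshard_lock(RGWRados* store, const RGWBucketInfo& bucket_info)
    {
      RGWBucketReshardLock reshard_lock(store, bucket_info, true /* ephemeral */);

      int ret = reshard_lock.lock();
      if (ret < 0) {
        return ret;                            // another radosgw holds the lock
      }

      while (more_entries()) {                 // hypothetical
        ret = process_one_entry();             // hypothetical
        if (ret < 0) {
          break;
        }
        auto now = ceph::coarse_mono_clock::now();
        if (reshard_lock.should_renew(now)) {
          ret = reshard_lock.renew(now);       // refresh the lease mid-run
          if (ret < 0) {
            break;                             // lease lost; stop immediately
          }
        }
      }

      reshard_lock.unlock();
      return ret;
    }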
 
 class RGWBucketReshard {
+public:
+
   friend class RGWReshard;
 
+  using Clock = ceph::coarse_mono_clock;
+
+private:
+
   RGWRados *store;
   RGWBucketInfo bucket_info;
   std::map<string, bufferlist> bucket_attrs;
 
-  string reshard_oid;
-  rados::cls::lock::Lock reshard_lock;
-
-  int lock_bucket();
-  void unlock_bucket();
-  int set_resharding_status(const string& new_instance_id, int32_t num_shards, cls_rgw_reshard_status status);
-  int clear_resharding();
+  RGWBucketReshardLock reshard_lock;
+  RGWBucketReshardLock* outer_reshard_lock;
 
-  int create_new_bucket_instance(int new_num_shards, RGWBucketInfo& new_bucket_info);
+  int create_new_bucket_instance(int new_num_shards,
+                                RGWBucketInfo& new_bucket_info);
   int do_reshard(int num_shards,
                 RGWBucketInfo& new_bucket_info,
                 int max_entries,
@@ -37,19 +78,46 @@ class RGWBucketReshard {
                  ostream *os,
                 Formatter *formatter);
 public:
-  RGWBucketReshard(RGWRados *_store, const RGWBucketInfo& _bucket_info,
-                   const std::map<string, bufferlist>& _bucket_attrs);
 
+  // pass nullptr for the final parameter if no outer reshard lock to
+  // manage
+  RGWBucketReshard(RGWRados *_store, const RGWBucketInfo& _bucket_info,
+                   const std::map<string, bufferlist>& _bucket_attrs,
+                  RGWBucketReshardLock* _outer_reshard_lock);
   int execute(int num_shards, int max_op_entries,
               bool verbose = false, ostream *out = nullptr,
               Formatter *formatter = nullptr,
              RGWReshard *reshard_log = nullptr);
-  int abort();
   int get_status(std::list<cls_rgw_bucket_instance_entry> *status);
   int cancel();
-};
+  static int clear_resharding(RGWRados* store,
+                             const RGWBucketInfo& bucket_info);
+  int clear_resharding() {
+    return clear_resharding(store, bucket_info);
+  }
+  static int clear_index_shard_reshard_status(RGWRados* store,
+                                             const RGWBucketInfo& bucket_info);
+  int clear_index_shard_reshard_status() {
+    return clear_index_shard_reshard_status(store, bucket_info);
+  }
+  static int set_resharding_status(RGWRados* store,
+                                  const RGWBucketInfo& bucket_info,
+                                  const string& new_instance_id,
+                                  int32_t num_shards,
+                                  cls_rgw_reshard_status status);
+  int set_resharding_status(const string& new_instance_id,
+                           int32_t num_shards,
+                           cls_rgw_reshard_status status) {
+    return set_resharding_status(store, bucket_info,
+                                new_instance_id, num_shards, status);
+  }
+}; // RGWBucketReshard
 
 class RGWReshard {
+public:
+    using Clock = ceph::coarse_mono_clock;
+
+private:
     RGWRados *store;
     string lock_name;
     rados::cls::lock::Lock instance_lock;
@@ -116,7 +184,9 @@ public:
   ~RGWReshardWait() {
     assert(going_down);
   }
-  int block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id);
+  int block_while_resharding(RGWRados::BucketShard *bs,
+                            string *new_bucket_id,
+                            const RGWBucketInfo& bucket_info);
 
   void stop() {
     Mutex::Locker l(lock);
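
The new RGWBucketReshardLock header above keeps a start time and a renew threshold at half the lock duration, and exposes should_renew()/renew() so long-running reshard work can keep the lock alive; callers with no outer lock pass nullptr, per the constructor comment. The following standalone sketch only illustrates that half-duration renewal pattern with std::chrono; the LoopLock name and the busy-work loop are assumptions, not the RGW implementation:

    #include <chrono>
    #include <iostream>
    #include <thread>

    // Illustrative stand-in for the renewal bookkeeping in RGWBucketReshardLock:
    // the renew threshold sits at start_time + duration / 2.
    class LoopLock {
      using Clock = std::chrono::steady_clock;
      std::chrono::seconds duration;
      Clock::time_point start_time;
      Clock::time_point renew_thresh;
    public:
      explicit LoopLock(std::chrono::seconds d) : duration(d) { reset_time(Clock::now()); }
      void reset_time(const Clock::time_point& now) {
        start_time = now;
        renew_thresh = start_time + duration / 2;
      }
      bool should_renew(const Clock::time_point& now) const { return now >= renew_thresh; }
      void renew(const Clock::time_point& now) { reset_time(now); }  // the real lock re-asserts the cls lock here
    };

    int main() {
      LoopLock lock(std::chrono::seconds(4));
      for (int i = 0; i < 10; ++i) {                 // stands in for per-shard reshard work
        std::this_thread::sleep_for(std::chrono::seconds(1));
        auto now = std::chrono::steady_clock::now();
        if (lock.should_renew(now)) {                // renew well before expiry while work proceeds
          lock.renew(now);
          std::cout << "renewed lock at iteration " << i << "\n";
        }
      }
    }
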
index 80a886ec5d118ba6cf2271a80a632d14f3000828..67de9bb9b068cb7ecb3cced61e43615bb99ead25 100644 (file)
@@ -1659,8 +1659,9 @@ int RGWListMultipart_ObjStore::get_params()
   }
   
   string str = s->info.args.get("max-parts");
-  if (!str.empty())
-    max_parts = atoi(str.c_str());
+  op_ret = parse_value_and_bound(str, max_parts, 0,
+                                 g_conf->get_val<uint64_t>("rgw_max_listing_results"),
+                                 max_parts);
 
   return op_ret;
 }
@@ -1670,10 +1671,12 @@ int RGWListBucketMultiparts_ObjStore::get_params()
   delimiter = s->info.args.get("delimiter");
   prefix = s->info.args.get("prefix");
   string str = s->info.args.get("max-uploads");
-  if (!str.empty())
-    max_uploads = atoi(str.c_str());
-  else
-    max_uploads = default_max;
+  op_ret = parse_value_and_bound(str, max_uploads, 0,
+                       g_conf->get_val<uint64_t>("rgw_max_listing_results"),
+                       default_max);
+  if (op_ret < 0) {
+    return op_ret;
+  }
 
   string key_marker = s->info.args.get("key-marker");
   string upload_id_marker = s->info.args.get("upload-id-marker");
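
Both hunks above replace an unchecked atoi() with parse_value_and_bound(), which bounds user-supplied listing limits by the rgw_max_listing_results config value. A rough standalone sketch of that clamping behavior follows; the exact semantics and signature of the real helper are assumptions for illustration, not the rgw_rest implementation:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <string>

    // Assumed semantics: an empty input yields the default, otherwise the value
    // is parsed and clamped into [lo, hi]; a negative return signals a bad value.
    static int parse_value_and_bound(const std::string& input, uint64_t& output,
                                     uint64_t lo, uint64_t hi, uint64_t def) {
      if (input.empty()) {
        output = def;
        return 0;
      }
      try {
        uint64_t v = std::stoull(input);
        output = std::min(std::max(v, lo), hi);
        return 0;
      } catch (const std::exception&) {
        return -22;  // stand-in for -EINVAL
      }
    }

    int main() {
      uint64_t max_uploads = 0;
      // "max-uploads=50000" capped by an assumed rgw_max_listing_results of 1000
      parse_value_and_bound("50000", max_uploads, 0, 1000, 1000);
      std::cout << max_uploads << "\n";  // prints 1000
    }
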
index 3b07327f38d7ce2c6a76b4157a098b6f2ea8270c..6534e254ec2360802055200eaf8535ddbf2b9ecc 100644 (file)
@@ -3751,8 +3751,9 @@ AWSGeneralAbstractor::get_auth_data_v4(const req_state* const s,
   boost::optional<std::string> canonical_headers = \
     get_v4_canonical_headers(s->info, signed_hdrs, using_qs);
   if (canonical_headers) {
-    ldout(s->cct, 10) << "canonical headers format = " << *canonical_headers
-                      << dendl;
+    using sanitize = rgw::crypt_sanitize::log_content;
+    ldout(s->cct, 10) << "canonical headers format = "
+                      << sanitize{*canonical_headers} << dendl;
   } else {
     throw -EPERM;
   }
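
Wrapping the canonical headers in rgw::crypt_sanitize::log_content before logging keeps SSE-C key material out of the debug log. A toy illustration of the wrapper-with-operator<< idiom follows; the mask_sensitive name and the substring matching are assumptions made for this sketch, not the actual sanitizer in rgw_crypt_sanitize.h:

    #include <iostream>
    #include <string>

    // Minimal stand-in: a wrapper whose stream operator masks anything that
    // looks like a customer-provided encryption key header value.
    struct mask_sensitive {
      const std::string& s;
    };

    inline std::ostream& operator<<(std::ostream& os, const mask_sensitive& m) {
      const std::string needle = "x-amz-server-side-encryption-customer-key:";
      auto pos = m.s.find(needle);
      if (pos == std::string::npos) {
        return os << m.s;
      }
      auto end = m.s.find('\n', pos);
      return os << m.s.substr(0, pos + needle.size()) << "***"
                << (end == std::string::npos ? "" : m.s.substr(end));
    }

    int main() {
      std::string canonical =
        "host:bucket.example.com\n"
        "x-amz-server-side-encryption-customer-key:SECRETSECRETSECRET\n";
      std::cout << mask_sensitive{canonical};  // key bytes are replaced by ***
    }
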
index c9d96d9631bf2a6841c23b85e409523a2725971f..35e192c150ed350736a819609118c362bd5ac6dc 100644 (file)
@@ -303,6 +303,8 @@ int RGWListBucket_ObjStore_SWIFT::get_params()
   if (op_ret < 0) {
     return op_ret;
   }
+  // S3 behavior is to silently cap the max-keys.
+  // Swift behavior is to abort.
   if (max > default_max)
     return -ERR_PRECONDITION_FAILED;
 
index ee526e3c03a41fdc80e1d834dbebf3c192d47c58..bd561d0c470cf6183b512398cb996051f5a34918 100644 (file)
@@ -151,37 +151,6 @@ void RGWOp_User_Create::execute()
   if (gen_key)
     op_state.set_generate_key();
 
-  RGWQuotaInfo bucket_quota;
-  RGWQuotaInfo user_quota;
-
-  if (s->cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
-    bucket_quota.max_objects = s->cct->_conf->rgw_bucket_default_quota_max_objects;
-    bucket_quota.enabled = true;
-  }
-
-  if (s->cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
-    bucket_quota.max_size = s->cct->_conf->rgw_bucket_default_quota_max_size;
-    bucket_quota.enabled = true;
-  }
-
-  if (s->cct->_conf->rgw_user_default_quota_max_objects >= 0) {
-    user_quota.max_objects = s->cct->_conf->rgw_user_default_quota_max_objects;
-    user_quota.enabled = true;
-  }
-
-  if (s->cct->_conf->rgw_user_default_quota_max_size >= 0) {
-    user_quota.max_size = s->cct->_conf->rgw_user_default_quota_max_size;
-    user_quota.enabled = true;
-  }
-
-  if (bucket_quota.enabled) {
-    op_state.set_bucket_quota(bucket_quota);
-  }
-
-  if (user_quota.enabled) {
-    op_state.set_user_quota(user_quota);
-  }
-
   http_ret = RGWUserAdminOp_User::create(store, op_state, flusher);
 }
 
@@ -895,11 +864,14 @@ void RGWOp_Quota_Set::execute()
         old_quota = &info.bucket_quota;
       }
 
-      int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size);
-      int64_t max_size_kb;
       RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
-      RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb);
-      quota.max_size = max_size_kb * 1024;
+      RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
+      int64_t max_size_kb;
+      bool has_max_size_kb = false;
+      RESTArgs::get_int64(s, "max-size-kb", 0, &max_size_kb, &has_max_size_kb);
+      if (has_max_size_kb) {
+        quota.max_size = max_size_kb * 1024;
+      }
       RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
     }
 
index 418441996f2b47fe6f878416a741cac7bae1f618..ffec32052370503df29b2503cf7eb61d95987229 100644 (file)
@@ -17,6 +17,7 @@
 #include <boost/circular_buffer.hpp>
 #include <boost/container/flat_map.hpp>
 
+#include "include/scope_guard.h"
 #include "common/bounded_key_counter.h"
 #include "common/errno.h"
 #include "rgw_sync_log_trim.h"
@@ -573,7 +574,6 @@ class AsyncMetadataList : public RGWAsyncRadosRequest {
   const std::string section;
   const std::string start_marker;
   MetadataListCallback callback;
-  void *handle{nullptr};
 
   int _send_request() override;
  public:
@@ -584,54 +584,55 @@ class AsyncMetadataList : public RGWAsyncRadosRequest {
     : RGWAsyncRadosRequest(caller, cn), cct(cct), mgr(mgr),
       section(section), start_marker(start_marker), callback(callback)
   {}
-  ~AsyncMetadataList() override {
-    if (handle) {
-      mgr->list_keys_complete(handle);
-    }
-  }
 };
 
 int AsyncMetadataList::_send_request()
 {
+  void* handle = nullptr;
+  std::list<std::string> keys;
+  bool truncated{false};
+  std::string marker;
+
   // start a listing at the given marker
   int r = mgr->list_keys_init(section, start_marker, &handle);
-  if (r < 0) {
+  if (r == -EINVAL) {
+    // restart with empty marker below
+  } else if (r < 0) {
     ldout(cct, 10) << "failed to init metadata listing: "
         << cpp_strerror(r) << dendl;
     return r;
-  }
-  ldout(cct, 20) << "starting metadata listing at " << start_marker << dendl;
-
-  std::list<std::string> keys;
-  bool truncated{false};
-  std::string marker;
-
-  do {
-    // get the next key and marker
-    r = mgr->list_keys_next(handle, 1, keys, &truncated);
-    if (r < 0) {
-      ldout(cct, 10) << "failed to list metadata: "
-          << cpp_strerror(r) << dendl;
-      return r;
-    }
-    marker = mgr->get_marker(handle);
-
-    if (!keys.empty()) {
-      assert(keys.size() == 1);
-      auto& key = keys.front();
-      if (!callback(std::move(key), std::move(marker))) {
-        return 0;
+  } else {
+    ldout(cct, 20) << "starting metadata listing at " << start_marker << dendl;
+
+    // release the handle when scope exits
+    auto g = make_scope_guard([=] { mgr->list_keys_complete(handle); });
+
+    do {
+      // get the next key and marker
+      r = mgr->list_keys_next(handle, 1, keys, &truncated);
+      if (r < 0) {
+        ldout(cct, 10) << "failed to list metadata: "
+            << cpp_strerror(r) << dendl;
+        return r;
       }
-    }
-  } while (truncated);
+      marker = mgr->get_marker(handle);
+
+      if (!keys.empty()) {
+        ceph_assert(keys.size() == 1);
+        auto& key = keys.front();
+        if (!callback(std::move(key), std::move(marker))) {
+          return 0;
+        }
+      }
+    } while (truncated);
 
-  if (start_marker.empty()) {
-    // already listed all keys
-    return 0;
+    if (start_marker.empty()) {
+      // already listed all keys
+      return 0;
+    }
   }
 
   // restart the listing from the beginning (empty marker)
-  mgr->list_keys_complete(handle);
   handle = nullptr;
 
   r = mgr->list_keys_init(section, "", &handle);
@@ -642,6 +643,8 @@ int AsyncMetadataList::_send_request()
   }
   ldout(cct, 20) << "restarting metadata listing" << dendl;
 
+  // release the handle when scope exits
+  auto g = make_scope_guard([=] { mgr->list_keys_complete(handle); });
   do {
     // get the next key and marker
     r = mgr->list_keys_next(handle, 1, keys, &truncated);
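
The fix above drops the handle cleanup from the destructor and instead ties list_keys_complete() to the enclosing scope with make_scope_guard, so the handle is released on every exit path, including the early returns on error. A generic sketch of the idiom follows (compile with -std=c++17); it is illustrative only, and ceph's actual helper in include/scope_guard.h may differ in detail:

    #include <iostream>
    #include <utility>

    // Run an arbitrary callable when the guard goes out of scope.
    template <typename F>
    class scope_guard {
      F f;
    public:
      explicit scope_guard(F&& fn) : f(std::forward<F>(fn)) {}
      ~scope_guard() { f(); }
      scope_guard(const scope_guard&) = delete;
      scope_guard& operator=(const scope_guard&) = delete;
    };

    // C++17 guaranteed copy elision lets the factory return a non-copyable guard.
    template <typename F>
    scope_guard<F> make_scope_guard(F&& f) {
      return scope_guard<F>(std::forward<F>(f));
    }

    int open_handle() { std::cout << "init listing handle\n"; return 42; }
    void close_handle(int) { std::cout << "list_keys_complete\n"; }

    int main() {
      int handle = open_handle();
      auto g = make_scope_guard([&] { close_handle(handle); });
      // Any early return or exception between here and the end of main still
      // runs close_handle(), which is the leak the patch closes.
      return 0;
    }
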
index 775fcd24e2d363a3aba0d9cf3495fef1a5330030..aa58219b5a9c2f66912b31299723fb77b54bb544 100644 (file)
@@ -243,6 +243,19 @@ struct es_index_config {
   }
 };
 
+static bool is_sys_attr(const std::string& attr_name){
+  static constexpr std::initializer_list<const char*> rgw_sys_attrs = {RGW_ATTR_PG_VER,
+                                                                       RGW_ATTR_SOURCE_ZONE,
+                                                                       RGW_ATTR_ID_TAG,
+                                                                       RGW_ATTR_TEMPURL_KEY1,
+                                                                       RGW_ATTR_TEMPURL_KEY2,
+                                                                       RGW_ATTR_UNIX1,
+                                                                       RGW_ATTR_UNIX_KEY1
+  };
+
+  return std::find(rgw_sys_attrs.begin(), rgw_sys_attrs.end(), attr_name) != rgw_sys_attrs.end();
+}
+
 struct es_obj_metadata {
   CephContext *cct;
   ElasticConfigRef es_conf;
@@ -267,7 +280,6 @@ struct es_obj_metadata {
 
     for (auto i : attrs) {
       const string& attr_name = i.first;
-      string name;
       bufferlist& val = i.second;
 
       if (attr_name.compare(0, sizeof(RGW_ATTR_PREFIX) - 1, RGW_ATTR_PREFIX) != 0) {
@@ -275,19 +287,22 @@ struct es_obj_metadata {
       }
 
       if (attr_name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1, RGW_ATTR_META_PREFIX) == 0) {
-        name = attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1);
-        custom_meta[name] = string(val.c_str(), (val.length() > 0 ? val.length() - 1 : 0));
+        custom_meta.emplace(attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1),
+                            string(val.c_str(), (val.length() > 0 ? val.length() - 1 : 0)));
         continue;
       }
 
-      name = attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1);
+      if (attr_name.compare(0, sizeof(RGW_ATTR_CRYPT_PREFIX) -1, RGW_ATTR_CRYPT_PREFIX) == 0) {
+        continue;
+      }
 
-      if (name == "acl") {
+      if (attr_name == RGW_ATTR_ACL) {
         try {
           auto i = val.begin();
           ::decode(policy, i);
         } catch (buffer::error& err) {
           ldout(cct, 0) << "ERROR: failed to decode acl for " << bucket_info.bucket << "/" << key << dendl;
+          continue;
         }
 
         const RGWAccessControlList& acl = policy.get_acl();
@@ -303,19 +318,30 @@ struct es_obj_metadata {
             }
           }
         }
-      } else if (name == "x-amz-tagging") {
-        auto tags_bl = val.begin();
-        ::decode(obj_tags, tags_bl);
-      } else if (name == "compression") {
+      } else if (attr_name == RGW_ATTR_TAGS) {
+        try {
+          auto tags_bl = val.begin();
+          ::decode(obj_tags, tags_bl);
+        } catch (buffer::error& err) {
+          ldout(cct,0) << "ERROR: failed to decode obj tags for "
+                       << bucket_info.bucket << "/" << key << dendl;
+          continue;
+        }
+      } else if (attr_name == RGW_ATTR_COMPRESSION) {
         RGWCompressionInfo cs_info;
-        auto vals_bl = val.begin();
-        decode(cs_info, vals_bl);
-        out_attrs[name] = cs_info.compression_type;
+        try {
+          auto vals_bl = val.begin();
+          ::decode(cs_info, vals_bl);
+        } catch (buffer::error& err) {
+          ldout(cct,0) << "ERROR: failed to decode compression attr for "
+                       << bucket_info.bucket << "/" << key << dendl;
+          continue;
+        }
+        out_attrs.emplace("compression",std::move(cs_info.compression_type));
       } else {
-        if (name != "pg_ver" &&
-            name != "source_zone" &&
-            name != "idtag") {
-          out_attrs[name] = string(val.c_str(), (val.length() > 0 ? val.length() - 1 : 0));
+        if (!is_sys_attr(attr_name)) {
+          out_attrs.emplace(attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1),
+                            std::string(val.c_str(), (val.length() > 0 ? val.length() - 1 : 0)));
         }
       }
     }
index d0e70602844c32dcbe9396c408a3379d19d768a3..6ba69335abf0c54cbcb3407b2adef4e6d4906acb 100644 (file)
@@ -23,6 +23,7 @@
 #include "rgw_common.h"
 
 #include "rgw_bucket.h"
+#include "rgw_quota.h"
 
 #define dout_subsys ceph_subsys_rgw
 
@@ -1992,14 +1993,7 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg)
   if (op_state.has_bucket_quota()) {
     user_info.bucket_quota = op_state.get_bucket_quota();
   } else {
-    if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
-      user_info.bucket_quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects;
-      user_info.bucket_quota.enabled = true;
-    }
-    if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
-      user_info.bucket_quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size;
-      user_info.bucket_quota.enabled = true;
-    }
+    rgw_apply_default_bucket_quota(user_info.bucket_quota, cct);
   }
 
   if (op_state.temp_url_key_specified) {
@@ -2013,14 +2007,7 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg)
   if (op_state.has_user_quota()) {
     user_info.user_quota = op_state.get_user_quota();
   } else {
-    if (cct->_conf->rgw_user_default_quota_max_objects >= 0) {
-      user_info.user_quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects;
-      user_info.user_quota.enabled = true;
-    }
-    if (cct->_conf->rgw_user_default_quota_max_size >= 0) {
-      user_info.user_quota.max_size = cct->_conf->rgw_user_default_quota_max_size;
-      user_info.user_quota.enabled = true;
-    }
+    rgw_apply_default_user_quota(user_info.user_quota, cct);
   }
 
   // update the request
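
Both removed blocks were the same default-quota boilerplate, now centralized in rgw_quota helpers. Based purely on the deleted lines, the bucket-quota helper presumably reduces to something like the sketch below; the stand-in types and the exact signature in rgw_quota.h are assumptions:

    #include <cstdint>
    #include <iostream>

    struct QuotaInfo {            // stand-in for RGWQuotaInfo
      int64_t max_objects = -1;
      int64_t max_size = -1;
      bool enabled = false;
    };

    struct Conf {                 // stand-in for the relevant ceph config values
      int64_t rgw_bucket_default_quota_max_objects = -1;
      int64_t rgw_bucket_default_quota_max_size = -1;
    };

    // Reconstructed from the removed inline code: enable the bucket quota when
    // either default limit is configured (>= 0).
    void rgw_apply_default_bucket_quota(QuotaInfo& quota, const Conf& conf) {
      if (conf.rgw_bucket_default_quota_max_objects >= 0) {
        quota.max_objects = conf.rgw_bucket_default_quota_max_objects;
        quota.enabled = true;
      }
      if (conf.rgw_bucket_default_quota_max_size >= 0) {
        quota.max_size = conf.rgw_bucket_default_quota_max_size;
        quota.enabled = true;
      }
    }

    int main() {
      QuotaInfo q;
      Conf c;
      c.rgw_bucket_default_quota_max_objects = 1000;
      rgw_apply_default_bucket_quota(q, c);
      std::cout << q.enabled << " " << q.max_objects << "\n";  // 1 1000
    }
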
diff --git a/ceph/src/test/cli/crushtool/crush-classes/a b/ceph/src/test/cli/crushtool/crush-classes/a
new file mode 100644 (file)
index 0000000..3e7d168
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/a differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/b b/ceph/src/test/cli/crushtool/crush-classes/b
new file mode 100644 (file)
index 0000000..68c900f
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/b differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/beesly b/ceph/src/test/cli/crushtool/crush-classes/beesly
new file mode 100644 (file)
index 0000000..0048b14
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/beesly differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/c b/ceph/src/test/cli/crushtool/crush-classes/c
new file mode 100644 (file)
index 0000000..e709cb9
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/c differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/d b/ceph/src/test/cli/crushtool/crush-classes/d
new file mode 100644 (file)
index 0000000..e431636
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/d differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/e b/ceph/src/test/cli/crushtool/crush-classes/e
new file mode 100644 (file)
index 0000000..a1720c7
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/e differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/f b/ceph/src/test/cli/crushtool/crush-classes/f
new file mode 100644 (file)
index 0000000..c91e426
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/f differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/flax b/ceph/src/test/cli/crushtool/crush-classes/flax
new file mode 100644 (file)
index 0000000..4f579dd
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/flax differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/g b/ceph/src/test/cli/crushtool/crush-classes/g
new file mode 100644 (file)
index 0000000..0e27a9d
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/g differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/gabe b/ceph/src/test/cli/crushtool/crush-classes/gabe
new file mode 100644 (file)
index 0000000..90e8ed7
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/gabe differ
diff --git a/ceph/src/test/cli/crushtool/crush-classes/gabe2 b/ceph/src/test/cli/crushtool/crush-classes/gabe2
new file mode 100644 (file)
index 0000000..c91e426
Binary files /dev/null and b/ceph/src/test/cli/crushtool/crush-classes/gabe2 differ
old mode 100755 (executable)
new mode 100644 (file)
index 389b061..1621753
@@ -63,6 +63,8 @@
      -i mapfn --move       name --loc type name ...
                            move the given item to specified location
      -i mapfn --reweight   recalculate all bucket weights
+     -i mapfn --rebuild-class-roots
+                           rebuild the per-class shadow trees (normally a no-op)
      -i mapfn --create-simple-rule name root type mode
                            create crush rule <name> to start from <root>,
                            replicate across buckets of type <type>, using
                            export select data generated during testing routine
                            to CSV files for off-line post-processing
                            use --help-output for more information
+     --reclassify          transform legacy CRUSH map buckets and rules
+                           by adding classes
+        --reclassify-bucket <bucket-match> <class> <default-parent>
+        --reclassify-root <bucket-name> <class>
+     --set-subtree-class <bucket-name> <class>
+                           set class for all items beneath bucket-name
+     --compare <otherfile> compare two maps using --test parameters
   
   Options for the output stage
   
diff --git a/ceph/src/test/cli/crushtool/reclassify.t b/ceph/src/test/cli/crushtool/reclassify.t
new file mode 100644 (file)
index 0000000..500cd0d
--- /dev/null
@@ -0,0 +1,588 @@
+  $ crushtool -i $TESTDIR/crush-classes/a --set-subtree-class default hdd --reclassify --reclassify-bucket %-ssd ssd default --reclassify-bucket ssd ssd default --reclassify-root default hdd -o foo
+  classify_root default (-1) as hdd
+    new class hdd exists as 1
+    renumbering bucket -1 -> -5
+    renumbering bucket -4 -> -6
+    renumbering bucket -3 -> -7
+    renumbering bucket -2 -> -8
+  classify_bucket %-ssd as ssd default bucket default (root)
+    created new class ssd as 2
+  match %-ssd to ttipod001-cephosd-2-ssd basename ttipod001-cephosd-2
+    have base -8
+  match %-ssd to ttipod001-cephosd-1-ssd basename ttipod001-cephosd-1
+    have base -7
+  match %-ssd to ttipod001-cephosd-3-ssd basename ttipod001-cephosd-3
+    have base -6
+  classify_bucket ssd as ssd default bucket default (root)
+    new class ssd exists as 2
+  match ssd to ssd basename default
+    have base -5
+  moving items from -24 (ttipod001-cephosd-3-ssd) to -6 (ttipod001-cephosd-3)
+  moving items from -23 (ttipod001-cephosd-1-ssd) to -7 (ttipod001-cephosd-1)
+  moving items from -22 (ttipod001-cephosd-2-ssd) to -8 (ttipod001-cephosd-2)
+  moving items from -21 (ssd) to -5 (default)
+  $ crushtool -i $TESTDIR/crush-classes/a --compare foo
+  rule 0 had 0/10240 mismatched mappings (0)
+  rule 1 had 0/10240 mismatched mappings (0)
+  maps appear equivalent
+
+  $ crushtool -i $TESTDIR/crush-classes/d --set-subtree-class default hdd --reclassify --reclassify-bucket %-ssd ssd default --reclassify-bucket ssd ssd default --reclassify-root default hdd -o foo
+  classify_root default (-1) as hdd
+    new class hdd exists as 1
+    renumbering bucket -1 -> -13
+    renumbering bucket -6 -> -14
+    renumbering bucket -5 -> -15
+    renumbering bucket -4 -> -16
+    renumbering bucket -3 -> -17
+    renumbering bucket -2 -> -18
+  classify_bucket %-ssd as ssd default bucket default (root)
+    created new class ssd as 2
+  match %-ssd to node-20-ssd basename node-20
+    have base -18
+  match %-ssd to node-21-ssd basename node-21
+    created base -25
+  match %-ssd to node-22-ssd basename node-22
+    created base -26
+  match %-ssd to node-23-ssd basename node-23
+    created base -27
+  match %-ssd to node-27-ssd basename node-27
+    created base -28
+  classify_bucket ssd as ssd default bucket default (root)
+    new class ssd exists as 2
+  match ssd to ssd basename default
+    have base -13
+  moving items from -12 (node-27-ssd) to -28 (node-27)
+  moving items from -11 (node-23-ssd) to -27 (node-23)
+  moving items from -10 (node-22-ssd) to -26 (node-22)
+  moving items from -9 (node-21-ssd) to -25 (node-21)
+  moving items from -8 (node-20-ssd) to -18 (node-20)
+  moving items from -7 (ssd) to -13 (default)
+  $ crushtool -i $TESTDIR/crush-classes/d --compare foo
+  rule 0 had 0/10240 mismatched mappings (0)
+  rule 1 had 0/10240 mismatched mappings (0)
+  maps appear equivalent
+
+  $ crushtool -i $TESTDIR/crush-classes/e --reclassify --reclassify-bucket ceph-osd-ssd-% ssd default --reclassify-bucket ssd-root ssd default --reclassify-root default hdd -o foo
+  classify_root default (-1) as hdd
+    new class hdd exists as 1
+    renumbering bucket -1 -> -55
+    renumbering bucket -34 -> -56
+    renumbering bucket -20 -> -57
+    renumbering bucket -14 -> -58
+    renumbering bucket -15 -> -59
+    renumbering bucket -16 -> -60
+    renumbering bucket -52 -> -61
+    renumbering bucket -46 -> -62
+    renumbering bucket -40 -> -63
+  classify_bucket ceph-osd-ssd-% as ssd default bucket default (root)
+    new class ssd exists as 0
+  match ceph-osd-ssd-% to ceph-osd-ssd-node4 basename node4
+    have base -57
+  match ceph-osd-ssd-% to ceph-osd-ssd-node3 basename node3
+    have base -58
+  match ceph-osd-ssd-% to ceph-osd-ssd-node1 basename node1
+    have base -60
+  match ceph-osd-ssd-% to ceph-osd-ssd-node2 basename node2
+    have base -59
+  match ceph-osd-ssd-% to ceph-osd-ssd-node5 basename node5
+    have base -56
+  match ceph-osd-ssd-% to ceph-osd-ssd-node6 basename node6
+    have base -63
+  match ceph-osd-ssd-% to ceph-osd-ssd-node7 basename node7
+    have base -62
+  match ceph-osd-ssd-% to ceph-osd-ssd-node8 basename node8
+    have base -61
+  classify_bucket ssd-root as ssd default bucket default (root)
+    new class ssd exists as 0
+  match ssd-root to ssd-root basename default
+    have base -55
+  moving items from -49 (ceph-osd-ssd-node8) to -61 (node8)
+  moving items from -43 (ceph-osd-ssd-node7) to -62 (node7)
+  moving items from -37 (ceph-osd-ssd-node6) to -63 (node6)
+  moving items from -31 (ceph-osd-ssd-node5) to -56 (node5)
+  moving items from -18 (ssd-root) to -55 (default)
+  moving items from -9 (ceph-osd-ssd-node2) to -59 (node2)
+  moving items from -7 (ceph-osd-ssd-node1) to -60 (node1)
+  moving items from -5 (ceph-osd-ssd-node3) to -58 (node3)
+  moving items from -3 (ceph-osd-ssd-node4) to -57 (node4)
+
+this one has weird node weights, so *lots* of mappings change...
+
+  $ crushtool -i $TESTDIR/crush-classes/e --compare foo
+  rule 0 had 6540/10240 mismatched mappings (0.638672)
+  rule 1 had 8417/10240 mismatched mappings (0.821973)
+  warning: maps are NOT equivalent
+  [1]
+
+  $ crushtool -i $TESTDIR/crush-classes/c --reclassify --reclassify-bucket %-SSD ssd default --reclassify-bucket ssd ssd default --reclassify-root default hdd -o foo
+  classify_root default (-1) as hdd
+    new class hdd exists as 0
+    renumbering bucket -1 -> -55
+    renumbering bucket -9 -> -56
+    renumbering bucket -8 -> -57
+    renumbering bucket -7 -> -58
+    renumbering bucket -6 -> -59
+    renumbering bucket -5 -> -60
+    renumbering bucket -4 -> -61
+    renumbering bucket -3 -> -62
+    renumbering bucket -2 -> -63
+  classify_bucket %-SSD as ssd default bucket default (root)
+    created new class ssd as 2
+  match %-SSD to Ceph-Stor1-SSD basename Ceph-Stor1
+    have base -63
+  match %-SSD to Ceph-Stor2-SSD basename Ceph-Stor2
+    have base -62
+  match %-SSD to Ceph-Stor3-SSD basename Ceph-Stor3
+    have base -61
+  match %-SSD to Ceph-Stor4-SSD basename Ceph-Stor4
+    have base -60
+  match %-SSD to Ceph-Stor5-SSD basename Ceph-Stor5
+    have base -59
+  match %-SSD to Ceph-Stor6-SSD basename Ceph-Stor6
+    have base -58
+  match %-SSD to Ceph-Stor7-SSD basename Ceph-Stor7
+    have base -57
+  match %-SSD to Ceph-Stor8-SSD basename Ceph-Stor8
+    have base -56
+  classify_bucket ssd as ssd default bucket default (root)
+    new class ssd exists as 2
+  match ssd to ssd basename default
+    have base -55
+  moving items from -18 (ssd) to -55 (default)
+  moving items from -17 (Ceph-Stor8-SSD) to -56 (Ceph-Stor8)
+  moving items from -16 (Ceph-Stor7-SSD) to -57 (Ceph-Stor7)
+  moving items from -15 (Ceph-Stor6-SSD) to -58 (Ceph-Stor6)
+  moving items from -14 (Ceph-Stor5-SSD) to -59 (Ceph-Stor5)
+  moving items from -13 (Ceph-Stor4-SSD) to -60 (Ceph-Stor4)
+  moving items from -12 (Ceph-Stor3-SSD) to -61 (Ceph-Stor3)
+  moving items from -11 (Ceph-Stor2-SSD) to -62 (Ceph-Stor2)
+  moving items from -10 (Ceph-Stor1-SSD) to -63 (Ceph-Stor1)
+
+Ceph-Stor1 has wonky crush weights, so a small number of mappings change
+because the new map has a strictly summing hierarchy.
+
+  $ crushtool -i $TESTDIR/crush-classes/c --compare foo
+  rule 0 had 158/10240 mismatched mappings (0.0154297)
+  rule 1 had 62/5120 mismatched mappings (0.0121094)
+  rule 2 had 0/10240 mismatched mappings (0)
+  warning: maps are NOT equivalent
+  [1]
+
+  $ crushtool -i $TESTDIR/crush-classes/beesly --set-subtree-class 0513-R-0060 hdd --set-subtree-class 0513-R-0050 hdd --reclassify --reclassify-root 0513-R-0050 hdd --reclassify-root 0513-R-0060 hdd -o foo
+  classify_root 0513-R-0050 (-2) as hdd
+    new class hdd exists as 0
+    renumbering bucket -2 -> -131
+    renumbering bucket -14 -> -132
+    renumbering bucket -34 -> -133
+    renumbering bucket -33 -> -134
+    renumbering bucket -30 -> -135
+    renumbering bucket -26 -> -136
+    renumbering bucket -22 -> -137
+    renumbering bucket -18 -> -138
+    renumbering bucket -13 -> -139
+    renumbering bucket -9 -> -140
+    renumbering bucket -12 -> -141
+    renumbering bucket -11 -> -142
+    renumbering bucket -32 -> -143
+    renumbering bucket -31 -> -144
+    renumbering bucket -10 -> -145
+    renumbering bucket -8 -> -146
+    renumbering bucket -6 -> -147
+    renumbering bucket -28 -> -148
+    renumbering bucket -27 -> -149
+    renumbering bucket -21 -> -150
+    renumbering bucket -20 -> -151
+    renumbering bucket -19 -> -152
+    renumbering bucket -7 -> -153
+    renumbering bucket -5 -> -154
+    renumbering bucket -4 -> -155
+    renumbering bucket -25 -> -156
+    renumbering bucket -24 -> -157
+    renumbering bucket -23 -> -158
+    renumbering bucket -17 -> -159
+    renumbering bucket -16 -> -160
+    renumbering bucket -15 -> -161
+    renumbering bucket -3 -> -162
+    renumbering bucket -72 -> -163
+    renumbering bucket -98 -> -164
+    renumbering bucket -97 -> -165
+    renumbering bucket -96 -> -166
+    renumbering bucket -95 -> -167
+    renumbering bucket -94 -> -168
+    renumbering bucket -93 -> -169
+    renumbering bucket -68 -> -170
+  classify_root 0513-R-0060 (-65) as hdd
+    new class hdd exists as 0
+    renumbering bucket -65 -> -35
+    renumbering bucket -76 -> -36
+    renumbering bucket -78 -> -37
+    renumbering bucket -87 -> -38
+    renumbering bucket -82 -> -39
+    renumbering bucket -81 -> -40
+    renumbering bucket -77 -> -41
+    renumbering bucket -75 -> -42
+    renumbering bucket -89 -> -43
+    renumbering bucket -85 -> -44
+    renumbering bucket -84 -> -45
+    renumbering bucket -74 -> -46
+    renumbering bucket -71 -> -47
+    renumbering bucket -80 -> -48
+    renumbering bucket -91 -> -49
+    renumbering bucket -90 -> -50
+    renumbering bucket -88 -> -51
+    renumbering bucket -79 -> -52
+    renumbering bucket -70 -> -53
+    renumbering bucket -86 -> -54
+    renumbering bucket -83 -> -55
+    renumbering bucket -73 -> -56
+    renumbering bucket -69 -> -57
+  $ crushtool -i $TESTDIR/crush-classes/beesly --compare foo
+  rule 0 had 0/10240 mismatched mappings (0)
+  rule 1 had 0/10240 mismatched mappings (0)
+  rule 2 had 0/10240 mismatched mappings (0)
+  rule 4 had 0/10240 mismatched mappings (0)
+  maps appear equivalent
+
+  $ crushtool -i $TESTDIR/crush-classes/flax --reclassify --reclassify-root default hdd -o foo
+  classify_root default (-1) as hdd
+    new class hdd exists as 0
+    renumbering bucket -1 -> -5
+    renumbering bucket -12 -> -7
+    renumbering bucket -9 -> -8
+    renumbering bucket -6 -> -10
+    renumbering bucket -4 -> -11
+    renumbering bucket -3 -> -13
+    renumbering bucket -2 -> -14
+  $ crushtool -i $TESTDIR/crush-classes/flax --compare foo
+  rule 0 had 0/10240 mismatched mappings (0)
+  maps appear equivalent
+
+  $ crushtool -i $TESTDIR/crush-classes/gabe --reclassify --reclassify-root default hdd -o foo
+  classify_root default (-1) as hdd
+    new class hdd exists as 0
+    rule 3 includes take on root default class 0
+  failed to reclassify map
+  [1]
+
+above fails because ec-rack-by-2-hdd also has a take default class hdd step.
+
+below is an adjusted version of the same cluster's map
+
+  $ crushtool -i $TESTDIR/crush-classes/gabe2 --reclassify --reclassify-root default hdd -o foo
+  classify_root default (-1) as hdd
+    new class hdd exists as 0
+    renumbering bucket -1 -> -178
+    renumbering bucket -4 -> -179
+    renumbering bucket -25 -> -180
+    renumbering bucket -16 -> -181
+    renumbering bucket -21 -> -182
+    renumbering bucket -19 -> -183
+    renumbering bucket -15 -> -184
+    renumbering bucket -7 -> -185
+    renumbering bucket -47 -> -186
+    renumbering bucket -18 -> -187
+    renumbering bucket -8 -> -188
+    renumbering bucket -6 -> -189
+    renumbering bucket -12 -> -190
+    renumbering bucket -23 -> -191
+    renumbering bucket -22 -> -192
+    renumbering bucket -20 -> -193
+    renumbering bucket -11 -> -194
+    renumbering bucket -10 -> -195
+    renumbering bucket -17 -> -196
+    renumbering bucket -13 -> -197
+    renumbering bucket -9 -> -198
+    renumbering bucket -3 -> -199
+    renumbering bucket -14 -> -200
+    renumbering bucket -5 -> -201
+    renumbering bucket -2 -> -202
+  $ crushtool -i $TESTDIR/crush-classes/gabe2 --compare foo
+  rule 0 had 627/10240 mismatched mappings (0.0612305)
+  rule 1 had 422/6144 mismatched mappings (0.0686849)
+  warning: maps are NOT equivalent
+  [1]
+
+
+
+  $ crushtool -i $TESTDIR/crush-classes/b --reclassify --reclassify-bucket %-hdd hdd default --reclassify-bucket %-ssd ssd default --reclassify-bucket ssd ssd default --reclassify-bucket hdd hdd default -o foo
+  classify_bucket %-hdd as hdd default bucket default (root)
+    new class hdd exists as 0
+  match %-hdd to berta-hdd basename berta
+    have base -37
+  match %-hdd to oelgard-hdd basename oelgard
+    have base -36
+  match %-hdd to leonhard-hdd basename leonhard
+    have base -33
+  match %-hdd to gottlieb-hdd basename gottlieb
+    have base -30
+  match %-hdd to hieronymus-hdd basename hieronymus
+    have base -31
+  match %-hdd to uhu-hdd basename uhu
+    have base -34
+  match %-hdd to euphrosyne-hdd basename euphrosyne
+    have base -35
+  match %-hdd to frauenhaus-hdd basename frauenhaus
+    created base -145
+  match %-hdd to herrenhaus-hdd basename herrenhaus
+    created base -146
+  match %-hdd to zoo-hdd basename zoo
+    created base -147
+  match %-hdd to borkenkaefer-hdd basename borkenkaefer
+    have base -4
+  match %-hdd to hirsch-hdd basename hirsch
+    have base -41
+  match %-hdd to cassowary-hdd basename cassowary
+    created base -148
+  match %-hdd to fuchs-hdd basename fuchs
+    created base -149
+  match %-hdd to analia-hdd basename analia
+    created base -150
+  match %-hdd to gundula-hdd basename gundula
+    created base -151
+  match %-hdd to achim-hdd basename achim
+    created base -152
+  match %-hdd to hugo-hdd basename hugo
+    created base -153
+  match %-hdd to carl-hdd basename carl
+    have base -32
+  classify_bucket %-ssd as ssd default bucket default (root)
+    new class ssd exists as 1
+  match %-ssd to frauenhaus-ssd basename frauenhaus
+    already creating base -145
+  match %-ssd to herrenhaus-ssd basename herrenhaus
+    already creating base -146
+  match %-ssd to zoo-ssd basename zoo
+    already creating base -147
+  match %-ssd to berta-ssd basename berta
+    have base -37
+  match %-ssd to euphrosyne-ssd basename euphrosyne
+    have base -35
+  match %-ssd to oelgard-ssd basename oelgard
+    have base -36
+  match %-ssd to leonhard-ssd basename leonhard
+    have base -33
+  match %-ssd to hieronymus-ssd basename hieronymus
+    have base -31
+  match %-ssd to gottlieb-ssd basename gottlieb
+    have base -30
+  match %-ssd to uhu-ssd basename uhu
+    have base -34
+  match %-ssd to borkenkaefer-ssd basename borkenkaefer
+    have base -4
+  match %-ssd to hirsch-ssd basename hirsch
+    have base -41
+  match %-ssd to phaidon-ssd basename phaidon
+    created base -154
+  match %-ssd to glykera-ssd basename glykera
+    created base -155
+  match %-ssd to bonobo-ssd basename bonobo
+    created base -156
+  classify_bucket hdd as hdd default bucket default (root)
+    new class hdd exists as 0
+  match hdd to hdd basename default
+    have base -1
+  classify_bucket ssd as ssd default bucket default (root)
+    new class ssd exists as 1
+  match ssd to ssd basename default
+    have base -1
+  moving items from -124 (bonobo-ssd) to -156 (bonobo)
+  moving items from -123 (glykera-ssd) to -155 (glykera)
+  moving items from -122 (phaidon-ssd) to -154 (phaidon)
+  moving items from -121 (carl-hdd) to -32 (carl)
+  moving items from -120 (hugo-hdd) to -153 (hugo)
+  moving items from -119 (achim-hdd) to -152 (achim)
+  moving items from -118 (gundula-hdd) to -151 (gundula)
+  moving items from -117 (analia-hdd) to -150 (analia)
+  moving items from -116 (fuchs-hdd) to -149 (fuchs)
+  moving items from -115 (cassowary-hdd) to -148 (cassowary)
+  moving items from -39 (hirsch-ssd) to -41 (hirsch)
+  moving items from -38 (hirsch-hdd) to -41 (hirsch)
+  moving items from -29 (borkenkaefer-ssd) to -4 (borkenkaefer)
+  moving items from -28 (hdd) to -1 (default)
+  moving items from -27 (ssd) to -1 (default)
+  moving items from -26 (uhu-ssd) to -34 (uhu)
+  moving items from -25 (gottlieb-ssd) to -30 (gottlieb)
+  moving items from -24 (hieronymus-ssd) to -31 (hieronymus)
+  moving items from -23 (leonhard-ssd) to -33 (leonhard)
+  moving items from -22 (borkenkaefer-hdd) to -4 (borkenkaefer)
+  moving items from -21 (oelgard-ssd) to -36 (oelgard)
+  moving items from -20 (euphrosyne-ssd) to -35 (euphrosyne)
+  moving items from -19 (berta-ssd) to -37 (berta)
+  moving items from -17 (zoo-ssd) to -147 (zoo)
+  moving items from -16 (herrenhaus-ssd) to -146 (herrenhaus)
+  moving items from -15 (frauenhaus-ssd) to -145 (frauenhaus)
+  moving items from -12 (zoo-hdd) to -147 (zoo)
+  moving items from -11 (herrenhaus-hdd) to -146 (herrenhaus)
+  moving items from -10 (frauenhaus-hdd) to -145 (frauenhaus)
+  moving items from -9 (euphrosyne-hdd) to -35 (euphrosyne)
+  moving items from -8 (uhu-hdd) to -34 (uhu)
+  moving items from -7 (hieronymus-hdd) to -31 (hieronymus)
+  moving items from -6 (gottlieb-hdd) to -30 (gottlieb)
+  moving items from -5 (leonhard-hdd) to -33 (leonhard)
+  moving items from -3 (oelgard-hdd) to -36 (oelgard)
+  moving items from -2 (berta-hdd) to -37 (berta)
+  new bucket -156 missing parent, adding at {root=default}
+  new bucket -155 missing parent, adding at {root=default}
+  new bucket -154 missing parent, adding at {root=default}
+
+  $ crushtool -i $TESTDIR/crush-classes/b --compare foo
+  rule 0 had 0/3072 mismatched mappings (0)
+  rule 1 had 0/4096 mismatched mappings (0)
+  maps appear equivalent
+
+  $ crushtool -i $TESTDIR/crush-classes/f --reclassify --reclassify-root default hdd -o foo
+  classify_root default (-1) as hdd
+    new class hdd exists as 0
+    renumbering bucket -1 -> -178
+    renumbering bucket -4 -> -179
+    renumbering bucket -25 -> -180
+    renumbering bucket -16 -> -181
+    renumbering bucket -21 -> -182
+    renumbering bucket -19 -> -183
+    renumbering bucket -15 -> -184
+    renumbering bucket -7 -> -185
+    renumbering bucket -47 -> -186
+    renumbering bucket -18 -> -187
+    renumbering bucket -8 -> -188
+    renumbering bucket -6 -> -189
+    renumbering bucket -12 -> -190
+    renumbering bucket -23 -> -191
+    renumbering bucket -22 -> -192
+    renumbering bucket -20 -> -193
+    renumbering bucket -11 -> -194
+    renumbering bucket -10 -> -195
+    renumbering bucket -17 -> -196
+    renumbering bucket -13 -> -197
+    renumbering bucket -9 -> -198
+    renumbering bucket -3 -> -199
+    renumbering bucket -14 -> -200
+    renumbering bucket -5 -> -201
+    renumbering bucket -2 -> -202
+
+We expect some mismatches below because some ssd-labeled nodes under default
+keep their original class.
+
+  $ crushtool -i $TESTDIR/crush-classes/f --compare foo
+  rule 0 had 627/10240 mismatched mappings (0.0612305)
+  rule 1 had 422/6144 mismatched mappings (0.0686849)
+  warning: maps are NOT equivalent
+  [1]
+
+  $ crushtool -i $TESTDIR/crush-classes/g --reclassify --reclassify-bucket sata-% hdd-sata default --reclassify-bucket sas-% hdd-sas default --reclassify-bucket sas hdd-sas default --reclassify-bucket sata hdd-sata default -o foo
+  classify_bucket sas as hdd-sas default bucket default (root)
+    created new class hdd-sas as 1
+  match sas to sas basename default
+    have base -1
+  classify_bucket sas-% as hdd-sas default bucket default (root)
+    new class hdd-sas exists as 1
+  match sas-% to sas-osd01 basename osd01
+    created base -73
+  match sas-% to sas-osd02 basename osd02
+    created base -74
+  match sas-% to sas-osd03 basename osd03
+    created base -75
+  match sas-% to sas-osd04 basename osd04
+    created base -76
+  match sas-% to sas-osd05 basename osd05
+    created base -77
+  match sas-% to sas-osd06 basename osd06
+    created base -78
+  match sas-% to sas-osd07 basename osd07
+    created base -79
+  match sas-% to sas-osd08 basename osd08
+    created base -80
+  match sas-% to sas-osd09 basename osd09
+    created base -81
+  match sas-% to sas-rack1 basename rack1
+    created base -82
+  match sas-% to sas-rack2 basename rack2
+    created base -83
+  match sas-% to sas-rack3 basename rack3
+    created base -84
+  classify_bucket sata as hdd-sata default bucket default (root)
+    created new class hdd-sata as 2
+  match sata to sata basename default
+    have base -1
+  classify_bucket sata-% as hdd-sata default bucket default (root)
+    new class hdd-sata exists as 2
+  match sata-% to sata-osd11 basename osd11
+    created base -85
+  match sata-% to sata-osd10 basename osd10
+    created base -86
+  match sata-% to sata-osd14 basename osd14
+    created base -87
+  match sata-% to sata-osd13 basename osd13
+    created base -88
+  match sata-% to sata-osd12 basename osd12
+    created base -89
+  match sata-% to sata-osd15 basename osd15
+    created base -90
+  match sata-% to sata-osd16 basename osd16
+    created base -91
+  match sata-% to sata-osd18 basename osd18
+    created base -92
+  match sata-% to sata-osd19 basename osd19
+    created base -93
+  match sata-% to sata-osd17 basename osd17
+    created base -94
+  match sata-% to sata-osd20 basename osd20
+    created base -95
+  match sata-% to sata-osd21 basename osd21
+    created base -96
+  match sata-% to sata-osd22 basename osd22
+    created base -97
+  match sata-% to sata-osd23 basename osd23
+    created base -98
+  match sata-% to sata-osd24 basename osd24
+    created base -99
+  match sata-% to sata-osd25 basename osd25
+    created base -100
+  match sata-% to sata-osd26 basename osd26
+    created base -101
+  match sata-% to sata-osd27 basename osd27
+    created base -102
+  match sata-% to sata-rack1 basename rack1
+    already creating base -82
+  match sata-% to sata-rack2 basename rack2
+    already creating base -83
+  match sata-% to sata-rack3 basename rack3
+    already creating base -84
+  moving items from -36 (sas) to -1 (default)
+  moving items from -35 (sata) to -1 (default)
+  moving items from -34 (sas-rack3) to -84 (rack3)
+  moving items from -33 (sas-rack2) to -83 (rack2)
+  moving items from -32 (sas-rack1) to -82 (rack1)
+  moving items from -31 (sata-rack3) to -84 (rack3)
+  moving items from -30 (sata-rack2) to -83 (rack2)
+  moving items from -29 (sata-rack1) to -82 (rack1)
+  moving items from -28 (sas-osd09) to -81 (osd09)
+  moving items from -27 (sas-osd08) to -80 (osd08)
+  moving items from -26 (sas-osd07) to -79 (osd07)
+  moving items from -25 (sas-osd06) to -78 (osd06)
+  moving items from -24 (sas-osd05) to -77 (osd05)
+  moving items from -23 (sas-osd04) to -76 (osd04)
+  moving items from -22 (sas-osd03) to -75 (osd03)
+  moving items from -21 (sas-osd02) to -74 (osd02)
+  moving items from -20 (sata-osd27) to -102 (osd27)
+  moving items from -19 (sas-osd01) to -73 (osd01)
+  moving items from -18 (sata-osd26) to -101 (osd26)
+  moving items from -17 (sata-osd25) to -100 (osd25)
+  moving items from -16 (sata-osd24) to -99 (osd24)
+  moving items from -15 (sata-osd23) to -98 (osd23)
+  moving items from -14 (sata-osd22) to -97 (osd22)
+  moving items from -13 (sata-osd21) to -96 (osd21)
+  moving items from -12 (sata-osd20) to -95 (osd20)
+  moving items from -11 (sata-osd17) to -94 (osd17)
+  moving items from -10 (sata-osd19) to -93 (osd19)
+  moving items from -9 (sata-osd18) to -92 (osd18)
+  moving items from -8 (sata-osd16) to -91 (osd16)
+  moving items from -7 (sata-osd15) to -90 (osd15)
+  moving items from -6 (sata-osd12) to -89 (osd12)
+  moving items from -5 (sata-osd13) to -88 (osd13)
+  moving items from -4 (sata-osd14) to -87 (osd14)
+  moving items from -3 (sata-osd10) to -86 (osd10)
+  moving items from -2 (sata-osd11) to -85 (osd11)
+  $ crushtool -i $TESTDIR/crush-classes/g --compare foo
+  rule 0 had 0/10240 mismatched mappings (0)
+  rule 1 had 0/10240 mismatched mappings (0)
+  maps appear equivalent
index f847c7677eb0736d07807e09e8b8098dd67512ac..706a5673932de0a711c4b5073fc3dd2ba53aba16 100644 (file)
     reshard list               list all bucket resharding or scheduled to be reshared
     reshard process            process of scheduled reshard jobs
     reshard cancel             cancel resharding a bucket
+    reshard stale-instances list list stale-instances from bucket resharding
+    reshard stale-instances rm   cleanup stale-instances from bucket resharding
     sync error list            list sync error
     sync error trim            trim sync error
   options:
index 0aeed8245e7d8fdae5d34668eb1bed095cb0e450..37d10a19cbcf4f0b5914e568422d76e7d62c7c03 100644 (file)
@@ -94,10 +94,10 @@ TEST(ClsLock, TestMultiLocking) {
   ASSERT_EQ(-EEXIST, l.lock_exclusive(&ioctx, oid));
 
   /* test idempotency */
-  l.set_renew(true);
+  l.set_may_renew(true);
   ASSERT_EQ(0, l.lock_exclusive(&ioctx, oid));
 
-  l.set_renew(false);
+  l.set_may_renew(false);
 
   /* test second client */
   Lock l2(lock_name);
@@ -204,7 +204,7 @@ TEST(ClsLock, TestMeta) {
   /* check new tag */
   string new_tag = "new_tag";
   l.set_tag(new_tag);
-  l.set_renew(true);
+  l.set_may_renew(true);
   ASSERT_EQ(0, l.lock_exclusive(&ioctx, oid));
   lock_info(&ioctx, oid, lock_name, lockers, NULL, &new_tag);
   ASSERT_EQ(1, (int)lockers.size());
@@ -391,3 +391,175 @@ TEST(ClsLock, TestSetCookie) {
 
   ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
 }
+
+TEST(ClsLock, TestRenew) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  bufferlist bl;
+
+  string oid1 = "foo1";
+  string lock_name1 = "mylock1";
+
+  ASSERT_EQ(0, ioctx.write(oid1, bl, bl.length(), 0));
+
+  Lock l1(lock_name1);
+  utime_t lock_duration1(5, 0);
+  l1.set_duration(lock_duration1);
+
+  ASSERT_EQ(0, l1.lock_exclusive(&ioctx, oid1));
+  l1.set_may_renew(true);
+  sleep(2);
+  ASSERT_EQ(0, l1.lock_exclusive(&ioctx, oid1));
+  sleep(7);
+  ASSERT_EQ(0, l1.lock_exclusive(&ioctx, oid1)) <<
+    "when a cls_lock is set to may_renew, a relock after expiration "
+    "should still work";
+  ASSERT_EQ(0, l1.unlock(&ioctx, oid1));
+
+  // ***********************************************
+
+  string oid2 = "foo2";
+  string lock_name2 = "mylock2";
+
+  ASSERT_EQ(0, ioctx.write(oid2, bl, bl.length(), 0));
+
+  Lock l2(lock_name2);
+  utime_t lock_duration2(5, 0);
+  l2.set_duration(lock_duration2);
+
+  ASSERT_EQ(0, l2.lock_exclusive(&ioctx, oid2));
+  l2.set_must_renew(true);
+  sleep(2);
+  ASSERT_EQ(0, l2.lock_exclusive(&ioctx, oid2));
+  sleep(7);
+  ASSERT_EQ(-ENOENT, l2.lock_exclusive(&ioctx, oid2)) <<
+    "when a cls_lock is set to must_renew, a relock after expiration "
+    "should fail";
+  ASSERT_EQ(-ENOENT, l2.unlock(&ioctx, oid2));
+
+  // ***********************************************
+
+  string oid3 = "foo3";
+  string lock_name3 = "mylock3";
+
+  ASSERT_EQ(0, ioctx.write(oid3, bl, bl.length(), 0));
+
+  Lock l3(lock_name3);
+  l3.set_duration(utime_t(5, 0));
+  l3.set_must_renew(true);
+
+  ASSERT_EQ(-ENOENT, l3.lock_exclusive(&ioctx, oid3)) <<
+    "unable to create a lock with must_renew";
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+TEST(ClsLock, TestExclusiveEphemeralBasic) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  bufferlist bl;
+
+  string oid1 = "foo1";
+  string oid2 = "foo2";
+  string lock_name1 = "mylock1";
+  string lock_name2 = "mylock2";
+
+  Lock l1(lock_name1);
+  l1.set_duration(utime_t(5, 0));
+
+  uint64_t size;
+  time_t mod_time;
+
+  l1.set_may_renew(true);
+  ASSERT_EQ(0, l1.lock_exclusive_ephemeral(&ioctx, oid1));
+  ASSERT_EQ(0, ioctx.stat(oid1, &size, &mod_time));
+  sleep(2);
+  ASSERT_EQ(0, l1.unlock(&ioctx, oid1));
+  ASSERT_EQ(-ENOENT, ioctx.stat(oid1, &size, &mod_time));
+
+  // ***********************************************
+
+  Lock l2(lock_name2);
+  utime_t lock_duration2(5, 0);
+  l2.set_duration(utime_t(5, 0));
+
+  ASSERT_EQ(0, l2.lock_exclusive(&ioctx, oid2));
+  ASSERT_EQ(0, ioctx.stat(oid2, &size, &mod_time));
+  sleep(2);
+  ASSERT_EQ(0, l2.unlock(&ioctx, oid2));
+  ASSERT_EQ(0, ioctx.stat(oid2, &size, &mod_time));
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+
+TEST(ClsLock, TestExclusiveEphemeralStealEphemeral) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  bufferlist bl;
+
+  string oid1 = "foo1";
+  string lock_name1 = "mylock1";
+
+  Lock l1(lock_name1);
+  l1.set_duration(utime_t(3, 0));
+
+  ASSERT_EQ(0, l1.lock_exclusive_ephemeral(&ioctx, oid1));
+  sleep(4);
+
+  // l1 is expired, l2 can take; l2 is also exclusive_ephemeral
+  Lock l2(lock_name1);
+  l2.set_duration(utime_t(3, 0));
+  ASSERT_EQ(0, l2.lock_exclusive_ephemeral(&ioctx, oid1));
+  sleep(1);
+  ASSERT_EQ(0, l2.unlock(&ioctx, oid1));
+
+  // l2 cannot unlock its expired lock
+  ASSERT_EQ(-ENOENT, l1.unlock(&ioctx, oid1));
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+
+TEST(ClsLock, TestExclusiveEphemeralStealExclusive) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  bufferlist bl;
+
+  string oid1 = "foo1";
+  string lock_name1 = "mylock1";
+
+  Lock l1(lock_name1);
+  l1.set_duration(utime_t(3, 0));
+
+  ASSERT_EQ(0, l1.lock_exclusive_ephemeral(&ioctx, oid1));
+  sleep(4);
+
+  // l1 is expired, l2 can take; l2 is exclusive (but not ephemeral)
+  Lock l2(lock_name1);
+  l2.set_duration(utime_t(3, 0));
+  ASSERT_EQ(0, l2.lock_exclusive(&ioctx, oid1));
+  sleep(1);
+  ASSERT_EQ(0, l2.unlock(&ioctx, oid1));
+
+  // l2 cannot unlock its expired lock
+  ASSERT_EQ(-ENOENT, l1.unlock(&ioctx, oid1));
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
index 2b51268fd3b5769639b20d5f6375a7e6cdaea4c8..8fd46085390520c551b232134483dc6083fe358d 100644 (file)
@@ -116,7 +116,7 @@ TEST(cls_rgw, index_basic)
   OpMgr mgr;
 
   ObjectWriteOperation *op = mgr.write_op();
-  cls_rgw_bucket_init(*op);
+  cls_rgw_bucket_init_index(*op);
   ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
 
   uint64_t epoch = 1;
@@ -150,7 +150,7 @@ TEST(cls_rgw, index_multiple_obj_writers)
   OpMgr mgr;
 
   ObjectWriteOperation *op = mgr.write_op();
-  cls_rgw_bucket_init(*op);
+  cls_rgw_bucket_init_index(*op);
   ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
 
   uint64_t obj_size = 1024;
@@ -187,7 +187,7 @@ TEST(cls_rgw, index_remove_object)
   OpMgr mgr;
 
   ObjectWriteOperation *op = mgr.write_op();
-  cls_rgw_bucket_init(*op);
+  cls_rgw_bucket_init_index(*op);
   ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
 
   uint64_t obj_size = 1024;
@@ -279,7 +279,7 @@ TEST(cls_rgw, index_suggest)
   OpMgr mgr;
 
   ObjectWriteOperation *op = mgr.write_op();
-  cls_rgw_bucket_init(*op);
+  cls_rgw_bucket_init_index(*op);
   ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
 
   uint64_t total_size = 0;
@@ -390,7 +390,7 @@ TEST(cls_rgw, index_list)
   OpMgr mgr;
 
   ObjectWriteOperation *op = mgr.write_op();
-  cls_rgw_bucket_init(*op);
+  cls_rgw_bucket_init_index(*op);
   ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
 
   uint64_t epoch = 1;
@@ -452,7 +452,7 @@ TEST(cls_rgw, bi_list)
   OpMgr mgr;
 
   ObjectWriteOperation *op = mgr.write_op();
-  cls_rgw_bucket_init(*op);
+  cls_rgw_bucket_init_index(*op);
   ASSERT_EQ(0, ioctx.operate(bucket_oid, op));
 
   string name;
index d296cdf0bea441d5650d4c7716bb8164de42d21e..46387c93541128be216c104cefe427dc9dd94a20 100644 (file)
@@ -6,5 +6,5 @@ add_executable(unittest_compression
   $<TARGET_OBJECTS:unit-main>
   )
 add_ceph_unittest(unittest_compression ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_compression)
-target_link_libraries(unittest_compression global)
+target_link_libraries(unittest_compression global gtest)
 add_dependencies(unittest_compression ceph_example)
index 5f546918788c1e04ad3b9f052b734b00b3bce053..c0ee9d721fe4775f166dd7e01967bd0bea744990 100755 (executable)
@@ -232,5 +232,11 @@ if [ $failed -gt 0 ]; then
   echo "FAILED $failed / $numtests tests."
   exit 1
 fi
+
+if [ $numtests -eq 0 ]; then
+  echo "FAILED: no tests found to run!"
+  exit 1
+fi
+
 echo "passed $numtests tests."
 
index 142622597386eaa510bfba80d79e430775268dfa..6671a42f177056a4ddbf9ea07263d2727c9ca57c 100644 (file)
@@ -8,7 +8,6 @@
 #include "gtest/gtest.h"
 #include <errno.h>
 #include <fcntl.h>
-#include <semaphore.h>
 #include <sstream>
 #include <string>
 #include <boost/scoped_ptr.hpp>
@@ -24,9 +23,7 @@ public:
   AioTestData()
     : m_cluster(NULL),
       m_ioctx(NULL),
-      m_init(false),
-      m_complete(false),
-      m_safe(false)
+      m_init(false)
   {
   }
 
@@ -35,30 +32,21 @@ public:
     if (m_init) {
       rados_ioctx_destroy(m_ioctx);
       destroy_one_pool(m_pool_name, &m_cluster);
-      sem_close(m_sem);
     }
   }
 
   std::string init()
   {
     int ret;
-    if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
-      int err = errno;
-      ostringstream oss;
-      oss << "sem_open failed: " << cpp_strerror(err);
-      return oss.str();
-    }
     m_pool_name = get_temp_pool_name();
     std::string err = create_one_pool(m_pool_name, &m_cluster);
     if (!err.empty()) {
-      sem_close(m_sem);
       ostringstream oss;
       oss << "create_one_pool(" << m_pool_name << ") failed: error " << err;
       return oss.str();
     }
     ret = rados_ioctx_create(m_cluster, m_pool_name.c_str(), &m_ioctx);
     if (ret) {
-      sem_close(m_sem);
       destroy_one_pool(m_pool_name, &m_cluster);
       ostringstream oss;
       oss << "rados_ioctx_create failed: error " << ret;
@@ -68,22 +56,17 @@ public:
     return "";
   }
 
-  sem_t *m_sem = nullptr;
   rados_t m_cluster;
   rados_ioctx_t m_ioctx;
   std::string m_pool_name;
   bool m_init;
-  bool m_complete;
-  bool m_safe;
 };
 
 class AioTestDataPP
 {
 public:
   AioTestDataPP()
-    : m_init(false),
-      m_complete(false),
-      m_safe(false)
+    : m_init(false)
   {
   }
 
@@ -92,7 +75,6 @@ public:
     if (m_init) {
       m_ioctx.close();
       destroy_one_pool_pp(m_pool_name, m_cluster);
-      sem_close(m_sem);
     }
   }
 
@@ -105,23 +87,15 @@ public:
   {
     int ret;
 
-    if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
-      int err = errno;
-      ostringstream oss;
-      oss << "sem_open failed: " << cpp_strerror(err);
-      return oss.str();
-    }
     m_pool_name = get_temp_pool_name();
     std::string err = create_one_pool_pp(m_pool_name, m_cluster, config);
     if (!err.empty()) {
-      sem_close(m_sem);
       ostringstream oss;
       oss << "create_one_pool(" << m_pool_name << ") failed: error " << err;
       return oss.str();
     }
     ret = m_cluster.ioctx_create(m_pool_name.c_str(), m_ioctx);
     if (ret) {
-      sem_close(m_sem);
       destroy_one_pool_pp(m_pool_name, m_cluster);
       ostringstream oss;
       oss << "rados_ioctx_create failed: error " << ret;
@@ -131,49 +105,18 @@ public:
     return "";
   }
 
-  sem_t *m_sem = nullptr;
   Rados m_cluster;
   IoCtx m_ioctx;
   std::string m_pool_name;
   bool m_init;
-  bool m_complete;
-  bool m_safe;
 };
 
-void set_completion_complete(rados_completion_t cb, void *arg)
-{
-  AioTestData *test = static_cast<AioTestData*>(arg);
-  test->m_complete = true;
-  sem_post(test->m_sem);
-}
-
-void set_completion_safe(rados_completion_t cb, void *arg)
-{
-  AioTestData *test = static_cast<AioTestData*>(arg);
-  test->m_safe = true;
-  sem_post(test->m_sem);
-}
-
-void set_completion_completePP(rados_completion_t cb, void *arg)
-{
-  AioTestDataPP *test = static_cast<AioTestDataPP*>(arg);
-  test->m_complete = true;
-  sem_post(test->m_sem);
-}
-
-void set_completion_safePP(rados_completion_t cb, void *arg)
-{
-  AioTestDataPP *test = static_cast<AioTestDataPP*>(arg);
-  test->m_safe = true;
-  sem_post(test->m_sem);
-}
-
 TEST(LibRadosAio, TooBig) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(-E2BIG, rados_aio_write(test_data.m_ioctx, "foo",
@@ -191,7 +134,7 @@ TEST(LibRadosAio, TooBigPP) {
 
   bufferlist bl;
   AioCompletion *aio_completion = test_data.m_cluster.aio_create_completion(
-                                                                            (void*)&test_data, NULL, NULL);
+                                                                            nullptr, NULL, NULL);
   ASSERT_EQ(-E2BIG, test_data.m_ioctx.aio_write("foo", aio_completion, bl, UINT_MAX, 0));
   ASSERT_EQ(-E2BIG, test_data.m_ioctx.aio_append("foo", aio_completion, bl, UINT_MAX));
   // ioctx.aio_write_full no way to overflow bl.length()
@@ -258,29 +201,27 @@ TEST(LibRadosAio, SimpleWrite) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
 
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion2, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   rados_aio_release(my_completion);
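The conversion above is the pattern repeated throughout this file: the old code posted the semaphore once from the "complete" callback and once from the "safe" callback, so each operation was waited on twice, whereas blocking on the completion itself needs only one wait. A minimal standalone sketch of that lifecycle, not part of this commit and assuming an already-open rados_ioctx_t io:

    rados_completion_t c;
    rados_aio_create_completion(nullptr, nullptr, nullptr, &c);   // no callbacks, no user data
    char payload[16];
    memset(payload, 0xab, sizeof(payload));
    rados_aio_write(io, "example-object", c, payload, sizeof(payload), 0);
    rados_aio_wait_for_complete(c);              // blocks until the op is acknowledged
    int r = rados_aio_get_return_value(c);       // 0 on success, -errno on failure
    rados_aio_release(c);                        // drop our reference to the completion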
@@ -296,15 +237,14 @@ TEST(LibRadosAio, SimpleWritePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completePP, set_completion_safePP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
                               my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
@@ -315,13 +255,12 @@ TEST(LibRadosAio, SimpleWritePP) {
   ASSERT_EQ("", test_data.init());
   test_data.m_ioctx.set_namespace("nspace");
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completePP, set_completion_safePP);
+         nullptr, nullptr, nullptr);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
                               my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
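The C++ tests follow the same shape through librados::AioCompletion. A minimal sketch, not part of this commit, assuming a connected librados::Rados cluster and an open librados::IoCtx ioctx:

    librados::bufferlist bl;
    bl.append("hello");
    librados::AioCompletion *c = cluster.aio_create_completion(nullptr, nullptr, nullptr);
    ioctx.aio_write("example-object", c, bl, bl.length(), 0);
    c->wait_for_complete();                  // same role as rados_aio_wait_for_complete()
    int r = c->get_return_value();
    c->release();                            // the tests above delete the completion instead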
@@ -332,8 +271,8 @@ TEST(LibRadosAio, WaitForSafe) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -348,7 +287,7 @@ TEST(LibRadosAio, WaitForSafePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completePP, set_completion_safePP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -367,23 +306,22 @@ TEST(LibRadosAio, RoundTrip) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[256];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -400,23 +338,22 @@ TEST(LibRadosAio, RoundTrip2) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -433,8 +370,8 @@ TEST(LibRadosAio, RoundTrip3) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
 
@@ -447,8 +384,7 @@ TEST(LibRadosAio, RoundTrip3) {
 
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
 
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
@@ -457,8 +393,8 @@ TEST(LibRadosAio, RoundTrip3) {
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
 
   rados_read_op_t op2 = rados_create_read_op();
   rados_read_op_read(op2, 0, sizeof(buf2), buf2, NULL, NULL);
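RoundTrip3 drives the read back through the operation API. For context, a sketch of the rest of a read-op round trip, not part of this commit, assuming io and an object name oid:

    char out[128];
    size_t bytes_read = 0;
    int read_rval = 0;
    rados_read_op_t op = rados_create_read_op();
    rados_read_op_read(op, 0, sizeof(out), out, &bytes_read, &read_rval);
    rados_completion_t c;
    rados_aio_create_completion(nullptr, nullptr, nullptr, &c);
    rados_aio_read_op_operate(op, io, c, oid, 0 /* flags */);
    rados_aio_wait_for_complete(c);
    int r = rados_aio_get_return_value(c);   // whole-op result; read_rval holds the read's own result
    rados_aio_release(c);
    rados_release_read_op(op);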
@@ -493,7 +429,7 @@ TEST(LibRadosAio, RoundTripPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completePP, set_completion_safePP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -504,13 +440,12 @@ TEST(LibRadosAio, RoundTripPP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completePP, set_completion_safePP);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
                              my_completion2, &bl2, sizeof(buf), 0));
@@ -529,7 +464,7 @@ TEST(LibRadosAio, RoundTripPP2) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completePP, set_completion_safePP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -540,13 +475,12 @@ TEST(LibRadosAio, RoundTripPP2) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completePP, set_completion_safePP);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
                              my_completion2, &bl2, sizeof(buf), 0));
@@ -621,7 +555,7 @@ TEST(LibRadosAio, RoundTripSparseReadPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completePP, set_completion_safePP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -632,14 +566,13 @@ TEST(LibRadosAio, RoundTripSparseReadPP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   std::map<uint64_t, uint64_t> extents;
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completePP, set_completion_safePP);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_sparse_read("foo",
                              my_completion2, &extents, &bl2, sizeof(buf), 0));
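Once the completion fires, aio_sparse_read has filled two outputs: extents maps object offsets to the lengths of the regions that actually hold data, and bl2 carries those regions back to back. A sketch of walking the result, not part of this commit:

    uint64_t bl_off = 0;
    for (std::map<uint64_t, uint64_t>::const_iterator i = extents.begin();
         i != extents.end(); ++i) {
      // bytes [bl_off, bl_off + i->second) of bl2 correspond to
      // object range [i->first, i->first + i->second)
      bl_off += i->second;
    }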
@@ -657,8 +590,8 @@ TEST(LibRadosAio, RoundTripAppend) {
   AioTestData test_data;
   rados_completion_t my_completion, my_completion2, my_completion3;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
@@ -670,8 +603,8 @@ TEST(LibRadosAio, RoundTripAppend) {
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0xdd, sizeof(buf2));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
                               my_completion2, buf2, sizeof(buf2)));
   {
@@ -681,8 +614,8 @@ TEST(LibRadosAio, RoundTripAppend) {
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   char buf3[sizeof(buf) + sizeof(buf2)];
   memset(buf3, 0, sizeof(buf3));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion3, buf3, sizeof(buf3), 0));
   {
@@ -701,7 +634,7 @@ TEST(LibRadosAio, RoundTripAppendPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -720,7 +653,7 @@ TEST(LibRadosAio, RoundTripAppendPP) {
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_append("foo", my_completion2,
                                            bl2, sizeof(buf2)));
@@ -731,7 +664,7 @@ TEST(LibRadosAio, RoundTripAppendPP) {
   ASSERT_EQ(0, my_completion2->get_return_value());
   bufferlist bl3;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion3, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
                              my_completion3, &bl3, 2 * sizeof(buf), 0));
@@ -754,8 +687,8 @@ TEST(LibRadosAio, RemoveTest) {
   rados_completion_t my_completion;
   AioTestData test_data;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   memset(buf, 0xaa, sizeof(buf));
   ASSERT_EQ(0, rados_append(test_data.m_ioctx, "foo", buf, sizeof(buf)));
   ASSERT_EQ(0, rados_aio_remove(test_data.m_ioctx, "foo", my_completion));
@@ -779,7 +712,7 @@ TEST(LibRadosAioPP, RemoveTestPP) {
   ASSERT_EQ(0, test_data.m_ioctx.append("foo", bl1, sizeof(buf)));
   boost::scoped_ptr<AioCompletion> my_completion
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   ASSERT_EQ(0, test_data.m_ioctx.aio_remove("foo", my_completion.get()));
   {
     TestAlarm alarm;
@@ -801,8 +734,8 @@ TEST(LibRadosAio, XattrsRoundTrip) {
   ASSERT_EQ(0, rados_append(test_data.m_ioctx, "foo", buf, sizeof(buf)));
   // async getxattr
   rados_completion_t my_completion;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   ASSERT_EQ(0, rados_aio_getxattr(test_data.m_ioctx, "foo", my_completion, attr1, buf, sizeof(buf)));
   {
     TestAlarm alarm;
@@ -812,8 +745,8 @@ TEST(LibRadosAio, XattrsRoundTrip) {
   rados_aio_release(my_completion);
   // async setxattr
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_setxattr(test_data.m_ioctx, "foo", my_completion2, attr1, attr1_buf, sizeof(attr1_buf)));
   {
     TestAlarm alarm;
@@ -823,8 +756,8 @@ TEST(LibRadosAio, XattrsRoundTrip) {
   rados_aio_release(my_completion2);
   // async getxattr
   rados_completion_t my_completion3;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_getxattr(test_data.m_ioctx, "foo", my_completion3, attr1, buf, sizeof(buf)));
   {
     TestAlarm alarm;
@@ -850,7 +783,7 @@ TEST(LibRadosAioPP, XattrsRoundTripPP) {
   // async getxattr
   boost::scoped_ptr<AioCompletion> my_completion
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   ASSERT_EQ(0, test_data.m_ioctx.aio_getxattr("foo", my_completion.get(), attr1, bl2));
   {
     TestAlarm alarm;
@@ -865,7 +798,7 @@ TEST(LibRadosAioPP, XattrsRoundTripPP) {
   ASSERT_EQ("", test_data2.init());
   boost::scoped_ptr<AioCompletion> my_completion2
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data2, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   ASSERT_EQ(0, test_data.m_ioctx.aio_setxattr("foo", my_completion2.get(), attr1, bl3));
   {
     TestAlarm alarm;
@@ -878,7 +811,7 @@ TEST(LibRadosAioPP, XattrsRoundTripPP) {
   ASSERT_EQ("", test_data3.init());
   boost::scoped_ptr<AioCompletion> my_completion3
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data3, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   ASSERT_EQ(0, test_data.m_ioctx.aio_getxattr("foo", my_completion3.get(), attr1, bl4));
   {
     TestAlarm alarm;
@@ -900,8 +833,8 @@ TEST(LibRadosAio, RmXattr) {
   ASSERT_EQ(0, rados_append(test_data.m_ioctx, "foo", buf, sizeof(buf)));  
   // async setxattr
   rados_completion_t my_completion;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   ASSERT_EQ(0, rados_aio_setxattr(test_data.m_ioctx, "foo", my_completion, attr1, attr1_buf, sizeof(attr1_buf)));
   {
     TestAlarm alarm;
@@ -911,8 +844,8 @@ TEST(LibRadosAio, RmXattr) {
   rados_aio_release(my_completion);
   // async rmxattr
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-            set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+            nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_rmxattr(test_data.m_ioctx, "foo", my_completion2, attr1));
   {
     TestAlarm alarm;
@@ -922,8 +855,8 @@ TEST(LibRadosAio, RmXattr) {
   rados_aio_release(my_completion2);
   // async getxattr after deletion
   rados_completion_t my_completion3;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-            set_completion_complete, set_completion_safe, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+            nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_getxattr(test_data.m_ioctx, "foo", my_completion3, attr1, buf, sizeof(buf)));
   {
     TestAlarm alarm;
@@ -939,8 +872,8 @@ TEST(LibRadosAio, RmXattr) {
   ASSERT_EQ(0, rados_write(test_data.m_ioctx, "foo_rmxattr", buf2, sizeof(buf2), 0));
   // asynx setxattr
   rados_completion_t my_completion4;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-            set_completion_complete, set_completion_safe, &my_completion4));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+            nullptr, nullptr, &my_completion4));
   ASSERT_EQ(0, rados_aio_setxattr(test_data.m_ioctx, "foo_rmxattr", my_completion4, attr2, attr2_buf, sizeof(attr2_buf)));
   {
     TestAlarm alarm;
@@ -952,8 +885,8 @@ TEST(LibRadosAio, RmXattr) {
   ASSERT_EQ(0, rados_remove(test_data.m_ioctx, "foo_rmxattr"));
   // async rmxattr on non existing object
   rados_completion_t my_completion5;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-            set_completion_complete, set_completion_safe, &my_completion5));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+            nullptr, nullptr, &my_completion5));
   ASSERT_EQ(0, rados_aio_rmxattr(test_data.m_ioctx, "foo_rmxattr", my_completion5, attr2));
   {
     TestAlarm alarm;
@@ -978,7 +911,7 @@ TEST(LibRadosAioPP, RmXattrPP) {
   bl2.append(attr1_buf, sizeof(attr1_buf));
   boost::scoped_ptr<AioCompletion> my_completion
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   ASSERT_EQ(0, test_data.m_ioctx.aio_setxattr("foo", my_completion.get(), attr1, bl2));
   {
     TestAlarm alarm;
@@ -990,7 +923,7 @@ TEST(LibRadosAioPP, RmXattrPP) {
   ASSERT_EQ("", test_data2.init());
   boost::scoped_ptr<AioCompletion> my_completion2
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data2, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   ASSERT_EQ(0, test_data.m_ioctx.aio_rmxattr("foo", my_completion2.get(), attr1));
   {
     TestAlarm alarm;
@@ -1002,7 +935,7 @@ TEST(LibRadosAioPP, RmXattrPP) {
   ASSERT_EQ("", test_data3.init());
   boost::scoped_ptr<AioCompletion> my_completion3
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data3, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   bufferlist bl3;
   ASSERT_EQ(0, test_data.m_ioctx.aio_getxattr("foo", my_completion3.get(), attr1, bl3));
   {
@@ -1025,7 +958,7 @@ TEST(LibRadosAioPP, RmXattrPP) {
   ASSERT_EQ("", test_data4.init());
   boost::scoped_ptr<AioCompletion> my_completion4
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data4, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   ASSERT_EQ(0, test_data.m_ioctx.aio_setxattr("foo_rmxattr", my_completion4.get(), attr2, bl22));
   {
     TestAlarm alarm;
@@ -1039,7 +972,7 @@ TEST(LibRadosAioPP, RmXattrPP) {
   ASSERT_EQ("", test_data5.init());
   boost::scoped_ptr<AioCompletion> my_completion5
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data5, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   ASSERT_EQ(0, test_data.m_ioctx.aio_rmxattr("foo_rmxattr", my_completion5.get(), attr2));
   {
     TestAlarm alarm;
@@ -1066,8 +999,8 @@ TEST(LibRadosAio, XattrIter) {
   ASSERT_EQ(0, rados_setxattr(test_data.m_ioctx, "foo", attr2, attr2_buf, sizeof(attr2_buf)));
   // call async version of getxattrs and wait for completion
   rados_completion_t my_completion;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-            set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+            nullptr, nullptr, &my_completion));
   rados_xattrs_iter_t iter;
   ASSERT_EQ(0, rados_aio_getxattrs(test_data.m_ioctx, "foo", my_completion, &iter));
   {
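After the async getxattrs completes, the iterator is drained with the synchronous helper calls. A sketch, not part of this commit:

    const char *name = NULL;
    const char *val = NULL;
    size_t len = 0;
    while (rados_getxattrs_next(iter, &name, &val, &len) == 0 && name != NULL) {
      // name is the xattr key; val points to len bytes of value data
    }
    rados_getxattrs_end(iter);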
@@ -1126,7 +1059,7 @@ TEST(LibRadosIoPP, XattrListPP) {
   // call async version of getxattrs
   boost::scoped_ptr<AioCompletion> my_completion
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   std::map<std::string, bufferlist> attrset;
   ASSERT_EQ(0, test_data.m_ioctx.aio_getxattrs("foo", my_completion.get(), attrset));
   {
@@ -1152,23 +1085,22 @@ TEST(LibRadosAio, IsComplete) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
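The IsComplete tests also exercise the non-blocking side of the API. As an alternative to wait_for_complete(), polling looks roughly like this (a sketch, not part of this commit; assumes <unistd.h> for usleep):

    while (!rados_aio_is_complete(my_completion2)) {
      usleep(1000);   // the caller is free to do other work between polls
    }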
@@ -1192,7 +1124,7 @@ TEST(LibRadosAio, IsCompletePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -1203,13 +1135,12 @@ TEST(LibRadosAio, IsCompletePP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
                                          &bl2, sizeof(buf), 0));
@@ -1235,8 +1166,8 @@ TEST(LibRadosAio, IsSafe) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -1256,8 +1187,8 @@ TEST(LibRadosAio, IsSafe) {
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -1274,7 +1205,7 @@ TEST(LibRadosAio, IsSafePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -1296,7 +1227,7 @@ TEST(LibRadosAio, IsSafePP) {
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   bufferlist bl2;
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
@@ -1316,8 +1247,8 @@ TEST(LibRadosAio, ReturnValue) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0, sizeof(buf));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent",
@@ -1334,7 +1265,7 @@ TEST(LibRadosAio, ReturnValuePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   bufferlist bl1;
@@ -1352,8 +1283,8 @@ TEST(LibRadosAio, Flush) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xee, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -1363,8 +1294,8 @@ TEST(LibRadosAio, Flush) {
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -1381,7 +1312,7 @@ TEST(LibRadosAio, FlushPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -1394,7 +1325,7 @@ TEST(LibRadosAio, FlushPP) {
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
                                          &bl2, sizeof(buf), 0));
@@ -1413,8 +1344,8 @@ TEST(LibRadosAio, FlushAsync) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   rados_completion_t flush_completion;
   ASSERT_EQ(0, rados_aio_create_completion(NULL, NULL, NULL, &flush_completion));
   char buf[128];
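FlushAsync pairs a write completion with a separate flush completion. For context, a sketch of the two flush variants on an ioctx, not part of this commit, assuming io:

    rados_aio_flush(io);                                   // blocks until all queued writes are stable
    rados_completion_t fc;
    rados_aio_create_completion(nullptr, nullptr, nullptr, &fc);
    rados_aio_flush_async(io, fc);                         // fc completes once prior writes are stable
    rados_aio_wait_for_complete(fc);
    rados_aio_release(fc);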
@@ -1435,8 +1366,8 @@ TEST(LibRadosAio, FlushAsync) {
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -1454,7 +1385,7 @@ TEST(LibRadosAio, FlushAsyncPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *flush_completion =
       test_data.m_cluster.aio_create_completion(NULL, NULL, NULL);
   AioCompletion *my_completion_null = NULL;
@@ -1478,7 +1409,7 @@ TEST(LibRadosAio, FlushAsyncPP) {
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
                                          &bl2, sizeof(buf), 0));
@@ -1498,8 +1429,8 @@ TEST(LibRadosAio, RoundTripWriteFull) {
   AioTestData test_data;
   rados_completion_t my_completion, my_completion2, my_completion3;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -1511,8 +1442,8 @@ TEST(LibRadosAio, RoundTripWriteFull) {
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[64];
   memset(buf2, 0xdd, sizeof(buf2));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_write_full(test_data.m_ioctx, "foo",
                               my_completion2, buf2, sizeof(buf2)));
   {
@@ -1522,8 +1453,8 @@ TEST(LibRadosAio, RoundTripWriteFull) {
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   char buf3[sizeof(buf) + sizeof(buf2)];
   memset(buf3, 0, sizeof(buf3));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion3, buf3, sizeof(buf3), 0));
   {
@@ -1541,7 +1472,7 @@ TEST(LibRadosAio, RoundTripWriteFullPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -1560,7 +1491,7 @@ TEST(LibRadosAio, RoundTripWriteFullPP) {
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write_full("foo", my_completion2, bl2));
   {
@@ -1570,7 +1501,7 @@ TEST(LibRadosAio, RoundTripWriteFullPP) {
   ASSERT_EQ(0, my_completion2->get_return_value());
   bufferlist bl3;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion3, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
                                          &bl3, sizeof(buf), 0));
@@ -1632,8 +1563,8 @@ TEST(LibRadosAio, RoundTripWriteSame) {
   AioTestData test_data;
   rados_completion_t my_completion, my_completion2, my_completion3;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char full[128];
   memset(full, 0xcc, sizeof(full));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -1647,8 +1578,8 @@ TEST(LibRadosAio, RoundTripWriteSame) {
   char buf[32];
   size_t ws_write_len = sizeof(full);
   memset(buf, 0xdd, sizeof(buf));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_writesame(test_data.m_ioctx, "foo",
                                   my_completion2, buf, sizeof(buf),
                                   ws_write_len, 0));
@@ -1657,8 +1588,8 @@ TEST(LibRadosAio, RoundTripWriteSame) {
     ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion3, full, sizeof(full), 0));
   {
@@ -1678,7 +1609,7 @@ TEST(LibRadosAio, RoundTripWriteSamePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char full[128];
@@ -1699,7 +1630,7 @@ TEST(LibRadosAio, RoundTripWriteSamePP) {
   bufferlist bl2;
   bl2.append(buf, sizeof(buf));
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_writesame("foo", my_completion2, bl2,
                                               ws_write_len, 0));
@@ -1710,7 +1641,7 @@ TEST(LibRadosAio, RoundTripWriteSamePP) {
   ASSERT_EQ(0, my_completion2->get_return_value());
   bufferlist bl3;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion3, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
                                          &bl3, sizeof(full), 0));
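writesame replicates one small buffer across a larger range: in the test above the 32-byte buf is stamped four times to cover the 128-byte write length. A compact C++ sketch, not part of this commit, assuming cluster and ioctx:

    librados::bufferlist pattern;
    pattern.append(std::string(32, '\xdd'));
    librados::AioCompletion *c = cluster.aio_create_completion(nullptr, nullptr, nullptr);
    ioctx.aio_writesame("obj", c, pattern, 128 /* write_len */, 0 /* offset */);
    c->wait_for_complete();
    int r = c->get_return_value();   // writesame expects write_len to be a multiple of the pattern size
    c->release();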
@@ -1782,23 +1713,22 @@ TEST(LibRadosAio, SimpleStat) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
   time_t pmtime;
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion2, &psize, &pmtime));
   {
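aio_stat reports the object size and mtime through out-parameters once the completion fires. A minimal C sketch, not part of this commit, assuming io:

    uint64_t size = 0;
    time_t mtime = 0;
    rados_completion_t c;
    rados_aio_create_completion(nullptr, nullptr, nullptr, &c);
    rados_aio_stat(io, "example-object", c, &size, &mtime);
    rados_aio_wait_for_complete(c);
    if (rados_aio_get_return_value(c) == 0) {
      // size and mtime are valid; -ENOENT would mean the object is gone (see StatRemove below)
    }
    rados_aio_release(c);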
@@ -1815,7 +1745,7 @@ TEST(LibRadosAio, SimpleStatPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -1826,14 +1756,13 @@ TEST(LibRadosAio, SimpleStatPP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
   time_t pmtime;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
                                        &psize, &pmtime));
@@ -1851,37 +1780,35 @@ TEST(LibRadosAio, SimpleStatNS) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
   char buf2[64];
   memset(buf2, 0xbb, sizeof(buf2));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf2, sizeof(buf2), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
   time_t pmtime;
   rados_completion_t my_completion2;
   rados_ioctx_set_namespace(test_data.m_ioctx, "");
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion2, &psize, &pmtime));
   {
@@ -1893,8 +1820,8 @@ TEST(LibRadosAio, SimpleStatNS) {
 
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
   rados_completion_t my_completion3;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion3, &psize, &pmtime));
   {
@@ -1913,7 +1840,7 @@ TEST(LibRadosAio, SimpleStatPPNS) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -1924,14 +1851,13 @@ TEST(LibRadosAio, SimpleStatPPNS) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
   time_t pmtime;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
                                        &psize, &pmtime));
@@ -1949,23 +1875,22 @@ TEST(LibRadosAio, StatRemove) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
   time_t pmtime;
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion2, &psize, &pmtime));
   {
@@ -1975,8 +1900,8 @@ TEST(LibRadosAio, StatRemove) {
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(sizeof(buf), psize);
   rados_completion_t my_completion3;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_remove(test_data.m_ioctx, "foo", my_completion3));
   {
     TestAlarm alarm;
@@ -1986,8 +1911,8 @@ TEST(LibRadosAio, StatRemove) {
   uint64_t psize2;
   time_t pmtime2;
   rados_completion_t my_completion4;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion4));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion4));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion4, &psize2, &pmtime2));
   {
@@ -2005,7 +1930,7 @@ TEST(LibRadosAio, StatRemovePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -2016,14 +1941,13 @@ TEST(LibRadosAio, StatRemovePP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
   time_t pmtime;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
                                        &psize, &pmtime));
@@ -2036,7 +1960,7 @@ TEST(LibRadosAio, StatRemovePP) {
   uint64_t psize2;
   time_t pmtime2;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion3, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_remove("foo", my_completion3));
   {
@@ -2046,7 +1970,7 @@ TEST(LibRadosAio, StatRemovePP) {
   ASSERT_EQ(0, my_completion3->get_return_value());
 
   AioCompletion *my_completion4 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion4, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion4,
                                        &psize2, &pmtime2));
@@ -2065,21 +1989,19 @@ TEST(LibRadosAio, ExecuteClass) {
   AioTestData test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   }
-  ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   char out[128];
   ASSERT_EQ(0, rados_aio_exec(test_data.m_ioctx, "foo", my_completion2,
                              "hello", "say_hello", NULL, 0, out, sizeof(out)));
@@ -2097,7 +2019,7 @@ TEST(LibRadosAio, ExecuteClassPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -2108,12 +2030,11 @@ TEST(LibRadosAio, ExecuteClassPP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   bufferlist in, out;
   ASSERT_EQ(0, test_data.m_ioctx.aio_exec("foo", my_completion2,
@@ -2337,8 +2258,8 @@ TEST(LibRadosAio, MultiWrite) {
   AioTestData test_data;
   rados_completion_t my_completion, my_completion2, my_completion3;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -2351,8 +2272,8 @@ TEST(LibRadosAio, MultiWrite) {
 
   char buf2[64];
   memset(buf2, 0xdd, sizeof(buf2));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion2, buf2, sizeof(buf2), sizeof(buf)));
   {
@@ -2363,8 +2284,8 @@ TEST(LibRadosAio, MultiWrite) {
 
   char buf3[(sizeof(buf) + sizeof(buf2)) * 3];
   memset(buf3, 0, sizeof(buf3));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_complete, set_completion_safe, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion3, buf3, sizeof(buf3), 0));
   {
@@ -2383,7 +2304,7 @@ TEST(LibRadosAio, MultiWritePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -2403,7 +2324,7 @@ TEST(LibRadosAio, MultiWritePP) {
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion2,
                                           bl2, sizeof(buf2), sizeof(buf)));
@@ -2415,7 +2336,7 @@ TEST(LibRadosAio, MultiWritePP) {
 
   bufferlist bl3;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion3, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
                                          &bl3, (sizeof(buf) + sizeof(buf2) * 3), 0));
@@ -2437,8 +2358,8 @@ TEST(LibRadosAio, AioUnlock) {
   ASSERT_EQ("", test_data.init());
   ASSERT_EQ(0, rados_lock_exclusive(test_data.m_ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
   rados_completion_t my_completion;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-            set_completion_complete, set_completion_safe, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+            nullptr, nullptr, &my_completion));
   ASSERT_EQ(0, rados_aio_unlock(test_data.m_ioctx, "foo", "TestLock", "Cookie", my_completion));
   {
     TestAlarm alarm;
@@ -2454,7 +2375,7 @@ TEST(LibRadosAio, AioUnlockPP) {
   ASSERT_EQ(0, test_data.m_ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
   boost::scoped_ptr<AioCompletion> my_completion
     (test_data.m_cluster.aio_create_completion
-     ((void*)&test_data, set_completion_completePP, set_completion_safePP));
+     (nullptr, nullptr, nullptr));
   ASSERT_EQ(0, test_data.m_ioctx.aio_unlock("foo", "TestLock", "Cookie", my_completion.get()));
   {
     TestAlarm alarm;
@@ -2472,9 +2393,7 @@ public:
   AioTestDataEC()
     : m_cluster(NULL),
       m_ioctx(NULL),
-      m_init(false),
-      m_complete(false),
-      m_safe(false)
+      m_init(false)
   {
   }
 
@@ -2483,30 +2402,21 @@ public:
     if (m_init) {
       rados_ioctx_destroy(m_ioctx);
       destroy_one_ec_pool(m_pool_name, &m_cluster);
-      sem_close(m_sem);
     }
   }
 
   std::string init()
   {
     int ret;
-    if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
-      int err = errno;
-      ostringstream oss;
-      oss << "sem_open failed: " << cpp_strerror(err);
-      return oss.str();
-    }
     m_pool_name = get_temp_pool_name();
     std::string err = create_one_ec_pool(m_pool_name, &m_cluster);
     if (!err.empty()) {
-      sem_close(m_sem);
       ostringstream oss;
       oss << "create_one_ec_pool(" << m_pool_name << ") failed: error " << err;
       return oss.str();
     }
     ret = rados_ioctx_create(m_cluster, m_pool_name.c_str(), &m_ioctx);
     if (ret) {
-      sem_close(m_sem);
       destroy_one_ec_pool(m_pool_name, &m_cluster);
       ostringstream oss;
       oss << "rados_ioctx_create failed: error " << ret;
@@ -2516,22 +2426,17 @@ public:
     return "";
   }
 
-  sem_t *m_sem;
   rados_t m_cluster;
   rados_ioctx_t m_ioctx;
   std::string m_pool_name;
   bool m_init;
-  bool m_complete;
-  bool m_safe;
 };
 
 class AioTestDataECPP
 {
 public:
   AioTestDataECPP()
-    : m_init(false),
-      m_complete(false),
-      m_safe(false)
+    : m_init(false)
   {
   }
 
@@ -2540,30 +2445,21 @@ public:
     if (m_init) {
       m_ioctx.close();
       destroy_one_ec_pool_pp(m_pool_name, m_cluster);
-      sem_close(m_sem);
     }
   }
 
   std::string init()
   {
     int ret;
-    if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
-      int err = errno;
-      ostringstream oss;
-      oss << "sem_open failed: " << cpp_strerror(err);
-      return oss.str();
-    }
     m_pool_name = get_temp_pool_name();
     std::string err = create_one_ec_pool_pp(m_pool_name, m_cluster);
     if (!err.empty()) {
-      sem_close(m_sem);
       ostringstream oss;
       oss << "create_one_ec_pool(" << m_pool_name << ") failed: error " << err;
       return oss.str();
     }
     ret = m_cluster.ioctx_create(m_pool_name.c_str(), m_ioctx);
     if (ret) {
-      sem_close(m_sem);
       destroy_one_ec_pool_pp(m_pool_name, m_cluster);
       ostringstream oss;
       oss << "rados_ioctx_create failed: error " << ret;
@@ -2573,70 +2469,37 @@ public:
     return "";
   }
 
-  sem_t *m_sem = nullptr;
   Rados m_cluster;
   IoCtx m_ioctx;
   std::string m_pool_name;
   bool m_init;
-  bool m_complete;
-  bool m_safe;
 };
 
-void set_completion_completeEC(rados_completion_t cb, void *arg)
-{
-  AioTestDataEC *test = static_cast<AioTestDataEC*>(arg);
-  test->m_complete = true;
-  sem_post(test->m_sem);
-}
-
-void set_completion_safeEC(rados_completion_t cb, void *arg)
-{
-  AioTestDataEC *test = static_cast<AioTestDataEC*>(arg);
-  test->m_safe = true;
-  sem_post(test->m_sem);
-}
-
-void set_completion_completeECPP(rados_completion_t cb, void *arg)
-{
-  AioTestDataECPP *test = static_cast<AioTestDataECPP*>(arg);
-  test->m_complete = true;
-  sem_post(test->m_sem);
-}
-
-void set_completion_safeECPP(rados_completion_t cb, void *arg)
-{
-  AioTestDataECPP *test = static_cast<AioTestDataECPP*>(arg);
-  test->m_safe = true;
-  sem_post(test->m_sem);
-}
-
 TEST(LibRadosAioEC, SimpleWrite) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
 
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion2, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion2));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   rados_aio_release(my_completion);
@@ -2652,15 +2515,14 @@ TEST(LibRadosAioEC, SimpleWritePP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
                               my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
@@ -2671,13 +2533,12 @@ TEST(LibRadosAioEC, SimpleWritePP) {
   ASSERT_EQ("", test_data.init());
   test_data.m_ioctx.set_namespace("nspace");
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+         nullptr, nullptr, nullptr);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo",
                               my_completion, bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   delete my_completion;
@@ -2688,8 +2549,8 @@ TEST(LibRadosAioEC, WaitForSafe) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -2704,7 +2565,7 @@ TEST(LibRadosAioEC, WaitForSafePP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -2723,23 +2584,22 @@ TEST(LibRadosAioEC, RoundTrip) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[256];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -2756,23 +2616,22 @@ TEST(LibRadosAioEC, RoundTrip2) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -2789,7 +2648,7 @@ TEST(LibRadosAioEC, RoundTripPP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -2800,13 +2659,12 @@ TEST(LibRadosAioEC, RoundTripPP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
                              my_completion2, &bl2, sizeof(buf), 0));
@@ -2825,7 +2683,7 @@ TEST(LibRadosAioEC, RoundTripPP2) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -2836,13 +2694,12 @@ TEST(LibRadosAioEC, RoundTripPP2) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
                              my_completion2, &bl2, sizeof(buf), 0));
@@ -2904,7 +2761,7 @@ TEST(LibRadosAioEC, RoundTripSparseReadPP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -2915,15 +2772,14 @@ TEST(LibRadosAioEC, RoundTripSparseReadPP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
 
   map<uint64_t, uint64_t> extents;
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeECPP, set_completion_safeECPP);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_sparse_read("foo",
                              my_completion2, &extents, &bl2, sizeof(buf), 0));
@@ -2941,8 +2797,8 @@ TEST(LibRadosAioEC, RoundTripAppend) {
   AioTestDataEC test_data;
   rados_completion_t my_completion, my_completion2, my_completion3, my_completion4;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   int requires;
   ASSERT_EQ(0, rados_ioctx_pool_requires_alignment2(test_data.m_ioctx, &requires));
   ASSERT_NE(0, requires);
@@ -2964,8 +2820,8 @@ TEST(LibRadosAioEC, RoundTripAppend) {
   int hbsize = bsize / 2;
   char *buf2 = (char *)new char[hbsize];
   memset(buf2, 0xdd, hbsize);
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
                               my_completion2, buf2, hbsize));
   {
@@ -2974,8 +2830,8 @@ TEST(LibRadosAioEC, RoundTripAppend) {
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
 
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_append(test_data.m_ioctx, "foo",
                               my_completion3, buf2, hbsize));
   {
@@ -2987,8 +2843,8 @@ TEST(LibRadosAioEC, RoundTripAppend) {
   int tbsize = bsize + hbsize;
   char *buf3 = (char *)new char[tbsize];
   memset(buf3, 0, tbsize);
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion4));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion4));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion4, buf3, bsize * 3, 0));
   {
@@ -3011,7 +2867,7 @@ TEST(LibRadosAioEC, RoundTripAppendPP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   bool requires;
@@ -3039,7 +2895,7 @@ TEST(LibRadosAioEC, RoundTripAppendPP) {
   bufferlist bl2;
   bl2.append(buf2, hbsize);
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_append("foo", my_completion2,
                                            bl2, hbsize));
@@ -3050,7 +2906,7 @@ TEST(LibRadosAioEC, RoundTripAppendPP) {
   ASSERT_EQ(0, my_completion2->get_return_value());
 
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion3, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_append("foo", my_completion3,
                                            bl2, hbsize));
@@ -3062,7 +2918,7 @@ TEST(LibRadosAioEC, RoundTripAppendPP) {
 
   bufferlist bl3;
   AioCompletion *my_completion4 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion4, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo",
                              my_completion4, &bl3, bsize * 3, 0));
@@ -3086,23 +2942,22 @@ TEST(LibRadosAioEC, IsComplete) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -3126,7 +2981,7 @@ TEST(LibRadosAioEC, IsCompletePP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -3137,13 +2992,12 @@ TEST(LibRadosAioEC, IsCompletePP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
                                          &bl2, sizeof(buf), 0));
@@ -3169,8 +3023,8 @@ TEST(LibRadosAioEC, IsSafe) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -3190,8 +3044,8 @@ TEST(LibRadosAioEC, IsSafe) {
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -3208,7 +3062,7 @@ TEST(LibRadosAioEC, IsSafePP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -3230,7 +3084,7 @@ TEST(LibRadosAioEC, IsSafePP) {
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   bufferlist bl2;
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
@@ -3250,8 +3104,8 @@ TEST(LibRadosAioEC, ReturnValue) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0, sizeof(buf));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent",
@@ -3268,7 +3122,7 @@ TEST(LibRadosAioEC, ReturnValuePP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   bufferlist bl1;
@@ -3286,8 +3140,8 @@ TEST(LibRadosAioEC, Flush) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xee, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -3297,8 +3151,8 @@ TEST(LibRadosAioEC, Flush) {
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -3315,7 +3169,7 @@ TEST(LibRadosAioEC, FlushPP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -3328,7 +3182,7 @@ TEST(LibRadosAioEC, FlushPP) {
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
                                          &bl2, sizeof(buf), 0));
@@ -3347,8 +3201,8 @@ TEST(LibRadosAioEC, FlushAsync) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   rados_completion_t flush_completion;
   ASSERT_EQ(0, rados_aio_create_completion(NULL, NULL, NULL, &flush_completion));
   char buf[128];
@@ -3369,8 +3223,8 @@ TEST(LibRadosAioEC, FlushAsync) {
   char buf2[128];
   memset(buf2, 0, sizeof(buf2));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion2, buf2, sizeof(buf2), 0));
   {
@@ -3388,7 +3242,7 @@ TEST(LibRadosAioEC, FlushAsyncPP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *flush_completion =
       test_data.m_cluster.aio_create_completion(NULL, NULL, NULL);
   AioCompletion *my_completion_null = NULL;
@@ -3412,7 +3266,7 @@ TEST(LibRadosAioEC, FlushAsyncPP) {
   ASSERT_EQ(0, my_completion->get_return_value());
   bufferlist bl2;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion2,
                                          &bl2, sizeof(buf), 0));
@@ -3432,8 +3286,8 @@ TEST(LibRadosAioEC, RoundTripWriteFull) {
   AioTestDataEC test_data;
   rados_completion_t my_completion, my_completion2, my_completion3;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -3445,8 +3299,8 @@ TEST(LibRadosAioEC, RoundTripWriteFull) {
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   char buf2[64];
   memset(buf2, 0xdd, sizeof(buf2));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_write_full(test_data.m_ioctx, "foo",
                               my_completion2, buf2, sizeof(buf2)));
   {
@@ -3456,8 +3310,8 @@ TEST(LibRadosAioEC, RoundTripWriteFull) {
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   char buf3[sizeof(buf) + sizeof(buf2)];
   memset(buf3, 0, sizeof(buf3));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion3, buf3, sizeof(buf3), 0));
   {
@@ -3475,7 +3329,7 @@ TEST(LibRadosAioEC, RoundTripWriteFullPP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -3494,7 +3348,7 @@ TEST(LibRadosAioEC, RoundTripWriteFullPP) {
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write_full("foo", my_completion2, bl2));
   {
@@ -3504,7 +3358,7 @@ TEST(LibRadosAioEC, RoundTripWriteFullPP) {
   ASSERT_EQ(0, my_completion2->get_return_value());
   bufferlist bl3;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion3, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
                                          &bl3, sizeof(buf), 0));
@@ -3566,23 +3420,22 @@ TEST(LibRadosAioEC, SimpleStat) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
   time_t pmtime;
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion2, &psize, &pmtime));
   {
@@ -3599,7 +3452,7 @@ TEST(LibRadosAioEC, SimpleStatPP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -3610,14 +3463,13 @@ TEST(LibRadosAioEC, SimpleStatPP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
   time_t pmtime;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
                                        &psize, &pmtime));
@@ -3635,37 +3487,35 @@ TEST(LibRadosAioEC, SimpleStatNS) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
   char buf2[64];
   memset(buf2, 0xbb, sizeof(buf2));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf2, sizeof(buf2), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
   time_t pmtime;
   rados_completion_t my_completion2;
   rados_ioctx_set_namespace(test_data.m_ioctx, "");
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion2, &psize, &pmtime));
   {
@@ -3677,8 +3527,8 @@ TEST(LibRadosAioEC, SimpleStatNS) {
 
   rados_ioctx_set_namespace(test_data.m_ioctx, "nspace");
   rados_completion_t my_completion3;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion3, &psize, &pmtime));
   {
@@ -3697,7 +3547,7 @@ TEST(LibRadosAioEC, SimpleStatPPNS) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -3708,14 +3558,13 @@ TEST(LibRadosAioEC, SimpleStatPPNS) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
   time_t pmtime;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
                                        &psize, &pmtime));
@@ -3733,23 +3582,22 @@ TEST(LibRadosAioEC, StatRemove) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   uint64_t psize;
   time_t pmtime;
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion2, &psize, &pmtime));
   {
@@ -3759,8 +3607,8 @@ TEST(LibRadosAioEC, StatRemove) {
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion2));
   ASSERT_EQ(sizeof(buf), psize);
   rados_completion_t my_completion3;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_remove(test_data.m_ioctx, "foo", my_completion3));
   {
     TestAlarm alarm;
@@ -3770,8 +3618,8 @@ TEST(LibRadosAioEC, StatRemove) {
   uint64_t psize2;
   time_t pmtime2;
   rados_completion_t my_completion4;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion4));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion4));
   ASSERT_EQ(0, rados_aio_stat(test_data.m_ioctx, "foo",
                              my_completion4, &psize2, &pmtime2));
   {
@@ -3789,7 +3637,7 @@ TEST(LibRadosAioEC, StatRemovePP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -3800,14 +3648,13 @@ TEST(LibRadosAioEC, StatRemovePP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   uint64_t psize;
   time_t pmtime;
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion2,
                                        &psize, &pmtime));
@@ -3820,7 +3667,7 @@ TEST(LibRadosAioEC, StatRemovePP) {
   uint64_t psize2;
   time_t pmtime2;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion3, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_remove("foo", my_completion3));
   {
@@ -3830,7 +3677,7 @@ TEST(LibRadosAioEC, StatRemovePP) {
   ASSERT_EQ(0, my_completion3->get_return_value());
 
   AioCompletion *my_completion4 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion4, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_stat("foo", my_completion4,
                                        &psize2, &pmtime2));
@@ -3849,21 +3696,20 @@ TEST(LibRadosAioEC, ExecuteClass) {
   AioTestDataEC test_data;
   rados_completion_t my_completion;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion, buf, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, rados_aio_wait_for_complete(my_completion));
   }
   ASSERT_EQ(0, rados_aio_get_return_value(my_completion));
   rados_completion_t my_completion2;
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   char out[128];
   ASSERT_EQ(0, rados_aio_exec(test_data.m_ioctx, "foo", my_completion2,
                              "hello", "say_hello", NULL, 0, out, sizeof(out)));
@@ -3881,7 +3727,7 @@ TEST(LibRadosAioEC, ExecuteClassPP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -3892,12 +3738,11 @@ TEST(LibRadosAioEC, ExecuteClassPP) {
                                           bl1, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
+    ASSERT_EQ(0, my_completion->wait_for_complete());
   }
   ASSERT_EQ(0, my_completion->get_return_value());
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   bufferlist in, out;
   ASSERT_EQ(0, test_data.m_ioctx.aio_exec("foo", my_completion2,
@@ -3949,8 +3794,8 @@ TEST(LibRadosAioEC, MultiWrite) {
   AioTestDataEC test_data;
   rados_completion_t my_completion, my_completion2, my_completion3;
   ASSERT_EQ("", test_data.init());
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
@@ -3963,8 +3808,8 @@ TEST(LibRadosAioEC, MultiWrite) {
 
   char buf2[64];
   memset(buf2, 0xdd, sizeof(buf2));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion2));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion2));
   ASSERT_EQ(0, rados_aio_write(test_data.m_ioctx, "foo",
                               my_completion2, buf2, sizeof(buf2), sizeof(buf)));
   {
@@ -3975,8 +3820,8 @@ TEST(LibRadosAioEC, MultiWrite) {
 
   char buf3[(sizeof(buf) + sizeof(buf2)) * 3];
   memset(buf3, 0, sizeof(buf3));
-  ASSERT_EQ(0, rados_aio_create_completion((void*)&test_data,
-             set_completion_completeEC, set_completion_safeEC, &my_completion3));
+  ASSERT_EQ(0, rados_aio_create_completion(nullptr,
+             nullptr, nullptr, &my_completion3));
   ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "foo",
                              my_completion3, buf3, sizeof(buf3), 0));
   {
@@ -3994,7 +3839,7 @@ TEST(LibRadosAioEC, MultiWritePP) {
   AioTestDataECPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char buf[128];
@@ -4014,7 +3859,7 @@ TEST(LibRadosAioEC, MultiWritePP) {
   bufferlist bl2;
   bl2.append(buf2, sizeof(buf2));
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion2,
                                           bl2, sizeof(buf2), sizeof(buf)));
@@ -4026,7 +3871,7 @@ TEST(LibRadosAioEC, MultiWritePP) {
 
   bufferlist bl3;
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_completeEC, set_completion_safeEC);
+         nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion3, my_completion_null);
   ASSERT_EQ(0, test_data.m_ioctx.aio_read("foo", my_completion3,
                                          &bl3, (sizeof(buf) + sizeof(buf2) * 3), 0));
@@ -4046,22 +3891,20 @@ TEST(LibRadosAio, RacingRemovePP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init({{"objecter_retry_writes_after_first_reply", "true"}}));
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-        (void*)&test_data, set_completion_complete, set_completion_safe);
+        nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion, nullptr);
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl;
   bl.append(buf, sizeof(buf));
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-        (void*)&test_data, set_completion_complete, set_completion_safe);
+        nullptr, nullptr, nullptr);
   ASSERT_NE(my_completion2, nullptr);
   ASSERT_EQ(0, test_data.m_ioctx.aio_remove("foo", my_completion2));
   ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
                                          bl, sizeof(buf), 0));
   {
     TestAlarm alarm;
-    sem_wait(test_data.m_sem);
-    sem_wait(test_data.m_sem);
     my_completion2->wait_for_complete();
     my_completion->wait_for_complete();
   }
@@ -4076,7 +3919,7 @@ TEST(LibRadosAio, RoundTripCmpExtPP) {
   AioTestDataPP test_data;
   ASSERT_EQ("", test_data.init());
   AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   AioCompletion *my_completion_null = NULL;
   ASSERT_NE(my_completion, my_completion_null);
   char full[128];
@@ -4095,7 +3938,7 @@ TEST(LibRadosAio, RoundTripCmpExtPP) {
   bufferlist cbl;
   cbl.append(full, sizeof(full));
   AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_EQ(0, test_data.m_ioctx.aio_cmpext("foo", my_completion2, 0, cbl));
 
   {
@@ -4109,7 +3952,7 @@ TEST(LibRadosAio, RoundTripCmpExtPP) {
   cbl.clear();
   cbl.append(full, sizeof(full));
   AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
-         (void*)&test_data, set_completion_complete, set_completion_safe);
+         nullptr, nullptr, nullptr);
   ASSERT_EQ(0, test_data.m_ioctx.aio_cmpext("foo", my_completion3, 0, cbl));
 
   {
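The hunks above drop the per-test complete/safe callbacks and the shared "/test_aio_sem" named semaphore, and instead block on the completion itself. A minimal sketch of the resulting pattern, assuming an already-open rados_ioctx_t and the illustrative object name "foo" (error handling collapsed into asserts):

    // Wait on the AIO completion directly; no callback argument and no
    // complete/safe callbacks are registered any more.
    #include <rados/librados.h>
    #include <cassert>
    #include <cstring>

    void write_and_wait(rados_ioctx_t ioctx) {
      rados_completion_t c;
      assert(0 == rados_aio_create_completion(nullptr, nullptr, nullptr, &c));
      char buf[128];
      memset(buf, 0xcc, sizeof(buf));
      assert(0 == rados_aio_write(ioctx, "foo", c, buf, sizeof(buf), 0));
      assert(0 == rados_aio_wait_for_complete(c));  // blocks until the op finishes
      assert(0 == rados_aio_get_return_value(c));
      rados_aio_release(c);
    }

With no callbacks left to post it, the semaphore bookkeeping (sem_open/sem_wait/sem_close) and the m_complete/m_safe flags become dead weight, which is why the AioTestDataEC and AioTestDataECPP fixtures above shed them.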
index 1a7c58380004646c8fc5192709520bd8a456daf7..3c5b985acc7733194e8dd7af08ef260d97fccb35 100644 (file)
@@ -72,16 +72,16 @@ TEST_F(LibRadosLockPP, LockSharedDurPP) {
   ASSERT_EQ(0, ioctx.lock_shared("foo", "TestLock", "Cookie", "Tag", "", NULL, 0));
 }
 
-TEST_F(LibRadosLock, LockRenew) {
+TEST_F(LibRadosLock, LockMayRenew) {
   ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
   ASSERT_EQ(-EEXIST, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
-  ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_RENEW));
+  ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_MAY_RENEW));
 }
 
-TEST_F(LibRadosLockPP, LockRenewPP) {
+TEST_F(LibRadosLockPP, LockMayRenewPP) {
   ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
   ASSERT_EQ(-EEXIST, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
-  ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_RENEW));
+  ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_MAY_RENEW));
 }
 
 TEST_F(LibRadosLock, Unlock) {
@@ -251,16 +251,16 @@ TEST_F(LibRadosLockECPP, LockSharedDurPP) {
   ASSERT_EQ(0, ioctx.lock_shared("foo", "TestLock", "Cookie", "Tag", "", NULL, 0));
 }
 
-TEST_F(LibRadosLockEC, LockRenew) {
+TEST_F(LibRadosLockEC, LockMayRenew) {
   ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
   ASSERT_EQ(-EEXIST, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, 0));
-  ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_RENEW));
+  ASSERT_EQ(0, rados_lock_exclusive(ioctx, "foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_MAY_RENEW));
 }
 
-TEST_F(LibRadosLockECPP, LockRenewPP) {
+TEST_F(LibRadosLockECPP, LockMayRenewPP) {
   ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
   ASSERT_EQ(-EEXIST, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
-  ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_RENEW));
+  ASSERT_EQ(0, ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, LOCK_FLAG_MAY_RENEW));
 }
 
 TEST_F(LibRadosLockEC, Unlock) {
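The lock tests above track the flag rename from LOCK_FLAG_RENEW to LOCK_FLAG_MAY_RENEW: re-taking an exclusive lock one already holds fails with -EEXIST unless that flag is passed. A short sketch of the behaviour with the C++ API, assuming an open IoCtx and the same illustrative object/lock names used in the tests:

    #include <rados/librados.hpp>
    #include <cassert>
    #include <cerrno>

    void renew_exclusive_lock(librados::IoCtx& ioctx) {
      // First acquisition succeeds; a plain re-acquisition is rejected...
      assert(0 == ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
      assert(-EEXIST == ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL, 0));
      // ...but the holder may refresh the lock by passing LOCK_FLAG_MAY_RENEW.
      assert(0 == ioctx.lock_exclusive("foo", "TestLock", "Cookie", "", NULL,
                                       LOCK_FLAG_MAY_RENEW));
    }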
index c07d4b61bb28fafded813b65d7f2e8fd24fc3d97..e5e52948116da8640bd9545fbf8996dc2d329573 100644 (file)
@@ -1140,6 +1140,12 @@ int cls_cxx_create(cls_method_context_t hctx, bool exclusive) {
   return ctx->io_ctx_impl->create(ctx->oid, exclusive);
 }
 
+int cls_cxx_remove(cls_method_context_t hctx) {
+  librados::TestClassHandler::MethodContext *ctx =
+    reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
+  return ctx->io_ctx_impl->remove(ctx->oid, ctx->io_ctx_impl->get_snap_context());
+}
+
 int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin) {
   librados::TestClassHandler::MethodContext *ctx =
     reinterpret_cast<librados::TestClassHandler::MethodContext*>(hctx);
index d2152d4562087144c4b3abc4f4b944ba41a1d12f..326c9785d0354a79423e36fa3a0071e8db14b048 100644 (file)
@@ -6855,6 +6855,73 @@ TEST_P(StoreTestSpecificAUSize, fsckOnUnalignedDevice2) {
   g_conf->apply_changes(NULL);
 }
 
+TEST_P(StoreTest, SpuriousReadErrorTest) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  ObjectStore::Sequencer osr("test");
+  int r;
+  auto logger = store->get_perf_counters();
+  coll_t cid;
+  ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    cerr << "Creating collection " << cid << std::endl;
+    r = store->queue_transaction(&osr, std::move(t), nullptr);
+    ASSERT_EQ(r, 0);
+  }
+  bufferlist test_data;
+  bufferptr ap(0x2000);
+  memset(ap.c_str(), 'a', 0x2000);
+  test_data.append(ap);
+  {
+    ObjectStore::Transaction t;
+    t.write(cid, hoid, 0, 0x2000, test_data);
+    r = store->queue_transaction(&osr, std::move(t), nullptr);
+    ASSERT_EQ(r, 0);
+    // force cache clear
+    EXPECT_EQ(store->umount(), 0);
+    EXPECT_EQ(store->mount(), 0);
+  }
+
+  cerr << "Injecting CRC error with no retry, expecting EIO" << std::endl;
+  g_conf->set_val("bluestore_retry_disk_reads", "0");
+  g_conf->set_val("bluestore_debug_inject_csum_err_probability", "1");
+  g_ceph_context->_conf->apply_changes(nullptr);
+  {
+    bufferlist in;
+    r = store->read(cid, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    ASSERT_EQ(-EIO, r);
+    ASSERT_EQ(logger->get(l_bluestore_read_eio), 1u);
+    ASSERT_EQ(logger->get(l_bluestore_reads_with_retries), 0u);
+  }
+
+  cerr << "Injecting CRC error with retries, expecting success after several retries" << std::endl;
+  g_conf->set_val("bluestore_retry_disk_reads", "255");
+  g_conf->set_val("bluestore_debug_inject_csum_err_probability", "0.8");
+  /**
+   * Probabilistic test: 25 reads, each has an 80% chance of failing, with 255 retries
+   * Probability of at least one retried read: 1 - (0.2 ** 25) = 100% - 3e-18
+   * Probability of a random test failure: 1 - ((1 - (0.8 ** 255)) ** 25) ~= 5e-24
+   */
+  g_ceph_context->_conf->apply_changes(nullptr);
+  {
+    for (int i = 0; i < 25; ++i) {
+      bufferlist in;
+      r = store->read(cid, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+      ASSERT_EQ(0x2000, r);
+      ASSERT_TRUE(bl_eq(test_data, in));
+    }
+    ASSERT_GE(logger->get(l_bluestore_reads_with_retries), 1u);
+  }
+
+  // revert
+  g_conf->set_val("bluestore_retry_disk_reads", "3");
+  g_conf->set_val("bluestore_debug_inject_csum_err_probability", "0");
+  g_ceph_context->_conf->apply_changes(nullptr);
+}
+
 int main(int argc, char **argv) {
   vector<const char*> args;
   argv_to_vec(argc, (const char **)argv, args);
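The new SpuriousReadErrorTest above exercises BlueStore's read-retry path: with bluestore_retry_disk_reads=0 an injected checksum error surfaces as -EIO, while with retries enabled each read is repeated until an attempt escapes the injected error. A small stand-alone check of the probabilities quoted in the test comment, assuming independent attempts (this only reproduces the arithmetic, it is not part of the test):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double p_err = 0.8;   // injected per-attempt CRC error probability
      const int reads = 25;
      const int retries = 255;
      // Chance that no read ever needs a retry: 0.2^25 ~= 3e-18, i.e. at least
      // one retried read is essentially certain.
      double p_no_retry = std::pow(1.0 - p_err, reads);
      // The comment estimates a single read failing as 0.8^255 (~2e-25), so to
      // first order the 25-read loop fails with probability ~25 * 0.8^255 ~= 5e-24.
      double p_test_fail = reads * std::pow(p_err, retries);
      std::printf("p(no retry) = %.3g, p(test failure) ~ %.3g\n",
                  p_no_retry, p_test_fail);
      return 0;
    }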
index 6335a0542051098a7c4abd4e736ae952c843535d..251ae4d994064301fb5ca9b89c6451990c563350 100644 (file)
@@ -598,6 +598,143 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
     ASSERT_TRUE(parent_0 != parent_1);
   }
 
+  {
+    // cancel stale upmaps
+    osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+    int from = -1;
+    for (int i = 0; i < (int)get_num_osds(); i++) {
+      if (std::find(up.begin(), up.end(), i) == up.end()) {
+        from = i;
+        break;
+      }
+    }
+    ASSERT_TRUE(from >= 0);
+    int to = -1;
+    for (int i = 0; i < (int)get_num_osds(); i++) {
+      if (std::find(up.begin(), up.end(), i) == up.end() && i != from) {
+        to = i;
+        break;
+      }
+    }
+    ASSERT_TRUE(to >= 0);
+    vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+    new_pg_upmap_items.push_back(make_pair(from, to));
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    pending_inc.new_pg_upmap_items[pgid] =
+      mempool::osdmap::vector<pair<int32_t,int32_t>>(
+        new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+    OSDMap nextmap;
+    nextmap.deepish_copy_from(osdmap);
+    nextmap.apply_incremental(pending_inc);
+    ASSERT_TRUE(nextmap.have_pg_upmaps(pgid));
+    OSDMap::Incremental new_pending_inc(nextmap.get_epoch() + 1);
+    nextmap.clean_pg_upmaps(g_ceph_context, &new_pending_inc);
+    nextmap.apply_incremental(new_pending_inc);
+    ASSERT_TRUE(!nextmap.have_pg_upmaps(pgid));
+  }
+
+  {
+    // https://tracker.ceph.com/issues/37493
+    pg_t ec_pg(0, my_ec_pool);
+    pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
+    OSDMap tmpmap; // use a tmpmap here, so we do not dirty the original map.
+    int from = -1;
+    int to = -1;
+    {
+      // insert a valid pg_upmap_item
+      vector<int> ec_up;
+      int ec_up_primary;
+      osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
+      ASSERT_TRUE(!ec_up.empty());
+      from = *(ec_up.begin());
+      ASSERT_TRUE(from >= 0);
+      for (int i = 0; i < (int)get_num_osds(); i++) {
+        if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
+          to = i;
+          break;
+        }
+      }
+      ASSERT_TRUE(to >= 0);
+      ASSERT_TRUE(from != to);
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(from, to));
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[ec_pgid] =
+      mempool::osdmap::vector<pair<int32_t,int32_t>>(
+        new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      tmpmap.deepish_copy_from(osdmap);
+      tmpmap.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+    }
+    {
+      // mark one of the target OSDs of the above pg_upmap_item as down
+      OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
+      pending_inc.new_state[to] = CEPH_OSD_UP;
+      tmpmap.apply_incremental(pending_inc);
+      ASSERT_TRUE(!tmpmap.is_up(to));
+      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+    }
+    {
+      // confirm *maybe_remove_pg_upmaps* won't do anything bad
+      OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
+      tmpmap.maybe_remove_pg_upmaps(g_ceph_context, tmpmap, &pending_inc);
+      tmpmap.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+    }
+  }
+
+  {
+    // http://tracker.ceph.com/issues/37501
+    pg_t ec_pg(0, my_ec_pool);
+    pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
+    OSDMap tmpmap; // use a tmpmap here, so we do not dirty the original map.
+    int from = -1;
+    int to = -1;
+    {
+      // insert a valid pg_upmap_item
+      vector<int> ec_up;
+      int ec_up_primary;
+      osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
+      ASSERT_TRUE(!ec_up.empty());
+      from = *(ec_up.begin());
+      ASSERT_TRUE(from >= 0);
+      for (int i = 0; i < (int)get_num_osds(); i++) {
+        if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
+          to = i;
+          break;
+        }
+      }
+      ASSERT_TRUE(to >= 0);
+      ASSERT_TRUE(from != to);
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(from, to));
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[ec_pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      tmpmap.deepish_copy_from(osdmap);
+      tmpmap.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+    }
+    {
+      // mark one of the target OSDs of the above pg_upmap_item as out
+      OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
+      pending_inc.new_weight[to] = CEPH_OSD_OUT;
+      tmpmap.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmpmap.is_out(to));
+      ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
+    }
+    {
+      // *maybe_remove_pg_upmaps* should be able to remove the above *bad* mapping
+      OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
+      OSDMap nextmap;
+      nextmap.deepish_copy_from(tmpmap);
+      nextmap.maybe_remove_pg_upmaps(g_ceph_context, nextmap, &pending_inc);
+      tmpmap.apply_incremental(pending_inc);
+      ASSERT_TRUE(!tmpmap.have_pg_upmaps(ec_pgid));
+    }
+  }
+
   {
     // TEST pg_upmap
     {
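The two tracker cases above (37493 and 37501) reduce to one rule, which the asserts spell out: a pg_upmap_item whose target OSD is merely down must survive cleanup, while one whose target OSD has been marked out must be removed. A minimal sketch of that rule in plain Python, using toy data structures rather than the OSDMap API:

    # Toy model of pg_upmap_item cleanup (illustrative only, not Ceph code):
    # keep a remapping while its target OSD is only down, drop it once the
    # target is marked out.
    def clean_pg_upmap_items(upmap_items, osds_out):
        """upmap_items: {pgid: [(from_osd, to_osd), ...]};
           osds_out: set of OSD ids currently marked out."""
        cleaned = {}
        for pgid, items in upmap_items.items():
            kept = [(f, t) for (f, t) in items if t not in osds_out]
            if kept:
                cleaned[pgid] = kept
        return cleaned

    items = {'1.0': [(0, 3)]}
    # target OSD 3 down but still in: the mapping survives (issue 37493)
    assert clean_pg_upmap_items(items, osds_out=set()) == {'1.0': [(0, 3)]}
    # target OSD 3 marked out: the mapping is dropped (issue 37501)
    assert clean_pg_upmap_items(items, osds_out={3}) == {}
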
index 58bd98224b14a734f7ea20bc55d755be2fd79a80..78e8a6b56e8481616f8815acbd0ff1da22a95120 100644 (file)
@@ -1,8 +1,9 @@
 from abc import ABCMeta, abstractmethod
-from cStringIO import StringIO
+from six import StringIO
+
 import json
 
-from conn import get_gateway_connection
+from .conn import get_gateway_connection
 
 class Cluster:
     """ interface to run commands against a distinct ceph cluster """
index 0807677d5d8c3834c14e925cb6215d9199ee4f5e..5b4f0a0d3caf776365bdce620337ac2e7338bb8a 100644 (file)
@@ -4,13 +4,14 @@ import string
 import sys
 import time
 import logging
+import errno
 
 try:
     from itertools import izip_longest as zip_longest
 except ImportError:
     from itertools import zip_longest
 from itertools import combinations
-from cStringIO import StringIO
+from six import StringIO
 
 import boto
 import boto.s3.connection
@@ -21,7 +22,7 @@ from nose.tools import eq_ as eq
 from nose.plugins.attrib import attr
 from nose.plugins.skip import SkipTest
 
-from .multisite import Zone
+from .multisite import Zone, ZoneGroup
 
 from .conn import get_gateway_connection
 
@@ -849,6 +850,34 @@ def test_zonegroup_remove():
     # validate the resulting period
     zonegroup.period.update(z1, commit=True)
 
+
+def test_zg_master_zone_delete():
+
+    master_zg = realm.master_zonegroup()
+    master_zone = master_zg.master_zone
+
+    assert(len(master_zg.zones) >= 1)
+    master_cluster = master_zg.zones[0].cluster
+
+    rm_zg = ZoneGroup('remove_zg')
+    rm_zg.create(master_cluster)
+
+    rm_zone = Zone('remove', rm_zg, master_cluster)
+    rm_zone.create(master_cluster)
+    master_zg.period.update(master_zone, commit=True)
+
+
+    rm_zone.delete(master_cluster)
+    # Period update: this should now fail because the deleted zone was the
+    # master zone of that zonegroup, leaving a dangling master_zone reference
+    _, retcode = master_zg.period.update(master_zone, check_retcode=False)
+    assert(retcode == errno.EINVAL)
+
+    # Proceed to delete the zonegroup as well; the period no longer contains
+    # a dangling master_zone, so this update must succeed
+    rm_zg.delete(master_cluster)
+    master_zg.period.update(master_zone, commit=True)
+
 def test_set_bucket_website():
     buckets, zone_bucket = create_bucket_per_zone_in_realm()
     for _, bucket in zone_bucket:
index 738ce1b78efc8e6300f4db6735ca59cd7081e29c..3bae06f47ee2af838539bd8952699c3f9f5c7237 100644 (file)
@@ -942,7 +942,7 @@ TEST(MatchPolicy, Resource)
   EXPECT_TRUE(match_policy("a:b:c", "a:b:c", flag));
   EXPECT_FALSE(match_policy("a:b:c", "A:B:C", flag)); // case sensitive
   EXPECT_TRUE(match_policy("a:*:e", "a:bcd:e", flag));
-  EXPECT_FALSE(match_policy("a:*", "a:b:c", flag)); // cannot span segments
+  EXPECT_TRUE(match_policy("a:*", "a:b:c", flag)); // can span segments
 }
 
 TEST(MatchPolicy, ARN)
@@ -960,5 +960,5 @@ TEST(MatchPolicy, String)
   EXPECT_TRUE(match_policy("a:b:c", "a:b:c", flag));
   EXPECT_FALSE(match_policy("a:b:c", "A:B:C", flag)); // case sensitive
   EXPECT_TRUE(match_policy("a:*:e", "a:bcd:e", flag));
-  EXPECT_FALSE(match_policy("a:*", "a:b:c", flag)); // cannot span segments
+  EXPECT_TRUE(match_policy("a:*", "a:b:c", flag)); // can span segments
 }
index a66cc2d70ba565f31b8853f2eb3898cfe466b597..a9756f73e2a9d6696b2ea567663348137909b93a 100644 (file)
@@ -62,9 +62,7 @@ void JournalTool::usage()
     << "    <output>: [summary|list|binary|json] [--path <path>]\n"
     << "\n"
     << "General options:\n"
-    << "  --rank=filesystem:mds-rank  Journal rank (required if multiple\n"
-    << "                              file systems, default is rank 0 on\n"
-    << "                              the only filesystem otherwise.\n"
+    << "  --rank=filesystem:mds-rank|all Journal rank (mandatory)\n"
     << "\n"
     << "Special options\n"
     << "  --alternate-pool <name>     Alternative metadata pool to target\n"
@@ -92,13 +90,12 @@ int JournalTool::main(std::vector<const char*> &argv)
   std::vector<const char*>::iterator arg = argv.begin();
 
   std::string rank_str;
-  if(!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
-    // Default: act on rank 0.  Will give the user an error if they
-    // try invoking this way when they have more than one filesystem.
-    rank_str = "0";
+  if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
+    derr << "missing mandatory \"--rank\" argument" << dendl;
+    return -EINVAL;
   }
 
-  r = role_selector.parse(*fsmap, rank_str);
+  r = role_selector.parse(*fsmap, rank_str, false);
   if (r != 0) {
     derr << "Couldn't determine MDS rank." << dendl;
     return r;
@@ -145,15 +142,27 @@ int JournalTool::main(std::vector<const char*> &argv)
 
   // Execution
   // =========
-  for (auto role : role_selector.get_roles()) {
+  auto roles = role_selector.get_roles();
+  if (roles.size() > 1) {
+    const std::string &command = argv[0];
+    bool allowed = can_execute_for_all_ranks(mode, command);
+    if (!allowed) {
+      derr << "operation not allowed for all ranks" << dendl;
+      return -EINVAL;
+    }
+
+    all_ranks = true;
+  }
+  for (auto role : roles) {
     rank = role.rank;
+    std::vector<const char *> rank_argv(argv);
     dout(4) << "Executing for rank " << rank << dendl;
     if (mode == std::string("journal")) {
-      r = main_journal(argv);
+      r = main_journal(rank_argv);
     } else if (mode == std::string("header")) {
-      r = main_header(argv);
+      r = main_header(rank_argv);
     } else if (mode == std::string("event")) {
-      r = main_event(argv);
+      r = main_event(rank_argv);
     } else {
       derr << "Bad command '" << mode << "'" << dendl;
       usage();
@@ -169,6 +178,23 @@ int JournalTool::main(std::vector<const char*> &argv)
 }
 
 
+std::string JournalTool::gen_dump_file_path(const std::string &prefix) {
+  if (!all_ranks) {
+    return prefix;
+  }
+
+  return prefix + "." + std::to_string(rank);
+}
+
+bool JournalTool::can_execute_for_all_ranks(const std::string &mode,
+                                            const std::string &command) {
+  if (mode == "journal" && command == "import") {
+    return false;
+  }
+
+  return true;
+}
+
 /**
  * Handle arguments for 'journal' mode
  *
@@ -376,6 +402,8 @@ int JournalTool::main_event(std::vector<const char*> &argv)
     }
   }
 
+  const std::string dump_path = gen_dump_file_path(output_path);
+
   // Execute command
   // ===============
   JournalScanner js(input, rank, filter);
@@ -493,7 +521,7 @@ int JournalTool::main_event(std::vector<const char*> &argv)
 
   // Generate output
   // ===============
-  EventOutput output(js, output_path);
+  EventOutput output(js, dump_path);
   int output_result = 0;
   if (output_style == "binary") {
       output_result = output.binary();
@@ -580,7 +608,8 @@ int JournalTool::journal_export(std::string const &path, bool import, bool force
     if (import) {
       r = dumper.undump(path.c_str(), force);
     } else {
-      r = dumper.dump(path.c_str());
+      const std::string ex_path = gen_dump_file_path(path);
+      r = dumper.dump(ex_path.c_str());
     }
   }
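With `--rank <filesystem>:all` the tool now iterates over every selected rank, so each dump or export file gets the rank appended to keep the outputs from overwriting each other. A quick sketch of that naming rule in plain Python, mirroring gen_dump_file_path rather than calling it:

    def gen_dump_file_path(prefix, rank, all_ranks):
        # single-rank invocations keep the user-supplied path as-is; all-ranks
        # invocations write one "<prefix>.<rank>" file per rank
        return prefix if not all_ranks else "%s.%d" % (prefix, rank)

    assert gen_dump_file_path("backup.bin", 0, all_ranks=False) == "backup.bin"
    assert gen_dump_file_path("backup.bin", 2, all_ranks=True) == "backup.bin.2"
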
 
index 2edeb6a76535c4e2f755634cda38aa6e26b8ee93..4dccd92f12b5448b9c47dcd66c7df4028b056828 100644 (file)
@@ -38,6 +38,9 @@ class JournalTool : public MDSUtility
     // various main_ functions.
     mds_rank_t rank;
 
+    // when set, generate per rank dump file path
+    bool all_ranks = false;
+
     // Entry points
     int main_journal(std::vector<const char*> &argv);
     int main_header(std::vector<const char*> &argv);
@@ -78,6 +81,14 @@ class JournalTool : public MDSUtility
         bufferlist *out_bl);
     int consume_inos(const std::set<inodeno_t> &inos);
 
+    // generate output file path for dump/export
+    std::string gen_dump_file_path(const std::string &prefix);
+
+    // check if an operation (mode, command) is safe to be
+    // executed on all ranks.
+    bool can_execute_for_all_ranks(const std::string &mode,
+                                   const std::string &command);
+
   public:
     void usage();
     JournalTool() :
index 3c7932a5eb9e6fa50f1b97d39411ed2cb3be182d..e2d53b86ea790b21f06a8b85dde2ba5412aade66 100644 (file)
@@ -29,13 +29,14 @@ int MDSRoleSelector::parse_rank(
   }
 }
 
-int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str)
+int MDSRoleSelector::parse(const FSMap &fsmap, std::string const &str,
+                           bool allow_unqualified_rank)
 {
   auto colon_pos = str.find(":");
   if (colon_pos == std::string::npos) {
     // An unqualified rank.  Only valid if there is only one
     // namespace.
-    if (fsmap.filesystem_count() == 1) {
+    if (fsmap.filesystem_count() == 1 && allow_unqualified_rank) {
       fscid = fsmap.get_filesystem()->fscid;
       return parse_rank(fsmap, str);
     } else {
index 933c51d0f78fd1dacfbd45ba8e178f7168f16c8b..9090b7200fb6d9d1396b87f6068f3d35cd33611d 100644 (file)
@@ -15,7 +15,8 @@ class MDSRoleSelector
 {
   public:
     const std::vector<mds_role_t> &get_roles() const {return roles;}
-    int parse(const FSMap &fsmap, std::string const &str);
+    int parse(const FSMap &fsmap, std::string const &str,
+            bool allow_unqualified_rank=true);
     MDSRoleSelector()
       : fscid(FS_CLUSTER_ID_NONE)
     {}
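The new allow_unqualified_rank flag lets callers such as the journal tool insist on a fully qualified `<filesystem>:<rank>` argument even when only one filesystem exists. A toy restatement of the parsing rule in Python (hypothetical helper, not the MDSRoleSelector API):

    def parse_role(filesystem_count, rank_str, allow_unqualified_rank=True):
        # "<fsname>:<rank>" is always accepted; a bare "<rank>" is only valid
        # when there is exactly one filesystem and the caller permits it
        if ":" in rank_str:
            fsname, rank = rank_str.split(":", 1)
            return fsname, rank
        if filesystem_count == 1 and allow_unqualified_rank:
            return "<the-only-fs>", rank_str
        raise ValueError("rank must be qualified as <filesystem>:<rank>")

    assert parse_role(1, "0") == ("<the-only-fs>", "0")
    assert parse_role(2, "cephfs:0") == ("cephfs", "0")
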
index aca8136cd09326dee50cea38c7d1a42c5cae8cd0..13bf247bb5e17b81b148a6dd0e2fba9ee9f7a7ac 100644 (file)
@@ -172,6 +172,8 @@ void usage()
   cout << "   -i mapfn --move       name --loc type name ...\n"
        << "                         move the given item to specified location\n";
   cout << "   -i mapfn --reweight   recalculate all bucket weights\n";
+  cout << "   -i mapfn --rebuild-class-roots\n";
+  cout << "                         rebuild the per-class shadow trees (normally a no-op)\n";
   cout << "   -i mapfn --create-simple-rule name root type mode\n"
        << "                         create crush rule <name> to start from <root>,\n"
        << "                         replicate across buckets of type <type>, using\n"
@@ -219,6 +221,13 @@ void usage()
   cout << "                         export select data generated during testing routine\n";
   cout << "                         to CSV files for off-line post-processing\n";
   cout << "                         use --help-output for more information\n";
+  cout << "   --reclassify          transform legacy CRUSH map buckets and rules\n";
+  cout << "                         by adding classes\n";
+  cout << "      --reclassify-bucket <bucket-match> <class> <default-parent>\n";
+  cout << "      --reclassify-root <bucket-name> <class>\n";
+  cout << "   --set-subtree-class <bucket-name> <class>\n";
+  cout << "                         set class for all items beneath bucket-name\n";
+  cout << "   --compare <otherfile> compare two maps using --test parameters\n";
   cout << "\n";
   cout << "Options for the output stage\n";
   cout << "\n";
@@ -341,6 +350,8 @@ int main(int argc, const char **argv)
   int verbose = 0;
   bool unsafe_tunables = false;
 
+  bool rebuild_class_roots = false;
+
   bool reweight = false;
   int add_item = -1;
   bool add_bucket = false;
@@ -368,6 +379,13 @@ int main(int argc, const char **argv)
   int straw_calc_version = -1;
   int allowed_bucket_algs = -1;
 
+  bool reclassify = false;
+  map<string,pair<string,string>> reclassify_bucket; // %suffix or prefix% -> class, default_root
+  map<string,string> reclassify_root;        // bucket -> class
+  map<string,string> set_subtree_class;     // bucket -> class
+
+  string compare;
+
   CrushWrapper crush;
 
   CrushTester tester(crush, cout);
@@ -406,6 +424,40 @@ int main(int argc, const char **argv)
       outfn = val;
     } else if (ceph_argparse_flag(args, i, "-v", "--verbose", (char*)NULL)) {
       verbose += 1;
+    } else if (ceph_argparse_witharg(args, i, &val, "--compare", (char*)NULL)) {
+      compare = val;
+    } else if (ceph_argparse_flag(args, i, "--reclassify", (char*)NULL)) {
+      reclassify = true;
+    } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-bucket",
+                                    (char*)NULL)) {
+      if (i == args.end()) {
+       cerr << "expecting additional argument" << std::endl;
+       return EXIT_FAILURE;
+      }
+      string c = *i;
+      i = args.erase(i);
+      if (i == args.end()) {
+       cerr << "expecting additional argument" << std::endl;
+       return EXIT_FAILURE;
+      }
+      reclassify_bucket[val] = make_pair(c, *i);
+      i = args.erase(i);
+    } else if (ceph_argparse_witharg(args, i, &val, "--reclassify-root",
+                                    (char*)NULL)) {
+      if (i == args.end()) {
+       cerr << "expecting additional argument" << std::endl;
+       return EXIT_FAILURE;
+      }
+      reclassify_root[val] = *i;
+      i = args.erase(i);
+    } else if (ceph_argparse_witharg(args, i, &val, "--set-subtree-class",
+                                    (char*)NULL)) {
+      if (i == args.end()) {
+       cerr << "expecting additional argument" << std::endl;
+       return EXIT_FAILURE;
+      }
+      set_subtree_class[val] = *i;
+      i = args.erase(i);
     } else if (ceph_argparse_flag(args, i, "--tree", (char*)NULL)) {
       tree = true;
     } else if (ceph_argparse_witharg(args, i, &val, "-f", "--format", (char*)NULL)) {
@@ -468,6 +520,8 @@ int main(int argc, const char **argv)
       adjust = true;
     } else if (ceph_argparse_flag(args, i, "--reweight", (char*)NULL)) {
       reweight = true;
+    } else if (ceph_argparse_flag(args, i, "--rebuild-class-roots", (char*)NULL)) {
+      rebuild_class_roots = true;
     } else if (ceph_argparse_witharg(args, i, &add_item, err, "--add_item", (char*)NULL)) {
       if (!err.str().empty()) {
        cerr << err.str() << std::endl;
@@ -733,7 +787,7 @@ int main(int argc, const char **argv)
     }
   }
 
-  if (test && !check && !display && !write_to_file) {
+  if (test && !check && !display && !write_to_file && compare.empty()) {
     cerr << "WARNING: no output selected; use --output-csv or --show-X" << std::endl;
   }
 
@@ -743,6 +797,8 @@ int main(int argc, const char **argv)
   }
   if (!check && !compile && !decompile && !build && !test && !reweight && !adjust && !tree && !dump &&
       add_item < 0 && !add_bucket && !move_item && !add_rule && !del_rule && full_location < 0 &&
+      !reclassify && !rebuild_class_roots &&
+      compare.empty() &&
       remove_name.empty() && reweight_name.empty()) {
     cerr << "no action specified; -h for help" << std::endl;
     return EXIT_FAILURE;
@@ -1086,7 +1142,31 @@ int main(int argc, const char **argv)
     crush.reweight(g_ceph_context);
     modified = true;
   }
+  if (rebuild_class_roots) {
+    int r = crush.rebuild_roots_with_classes();
+    if (r < 0) {
+      cerr << "failed to rebuidl roots with classes" << std::endl;
+      return EXIT_FAILURE;
+    }
+    modified = true;
+  }
 
+  for (auto& i : set_subtree_class) {
+    crush.set_subtree_class(i.first, i.second);
+    modified = true;
+  }
+  if (reclassify) {
+    int r = crush.reclassify(
+      g_ceph_context,
+      cout,
+      reclassify_root,
+      reclassify_bucket);
+    if (r < 0) {
+      cerr << "failed to reclassify map" << std::endl;
+      return EXIT_FAILURE;
+    }
+    modified = true;
+  }
 
   // display ---
   if (full_location >= 0) {
@@ -1099,7 +1179,7 @@ int main(int argc, const char **argv)
   }
 
   if (tree) {
-    crush.dump_tree(&cout, NULL);
+    crush.dump_tree(&cout, NULL, {}, true);
   }
 
   if (dump) {
@@ -1146,6 +1226,28 @@ int main(int argc, const char **argv)
       return EXIT_FAILURE;
   }
 
+  if (compare.size()) {
+    CrushWrapper crush2;
+    bufferlist in;
+    string error;
+    int r = in.read_file(compare.c_str(), &error);
+    if (r < 0) {
+      cerr << me << ": error reading '" << compare << "': "
+          << error << std::endl;
+      return EXIT_FAILURE;
+    }
+    auto p = in.begin();
+    try {
+      crush2.decode(p);
+    } catch(...) {
+      cerr << me << ": unable to decode " << compare << std::endl;
+      return EXIT_FAILURE;
+    }
+    r = tester.compare(crush2);
+    if (r < 0)
+      return EXIT_FAILURE;
+  }
+
   // output ---
   if (modified) {
     crush.finalize();
index 04fb7458637772b20fde78173ef54248dc3aa954..b7bdf9f0e3aa94099890a820e84651f786664db9 100644 (file)
@@ -1283,7 +1283,7 @@ static int do_cache_flush_evict_all(IoCtx& io_ctx, bool blocking)
       }
     }
   }
-  catch (const std::runtime_error& e) {
+  catch (const std::exception& e) {
     cerr << e.what() << std::endl;
     return -1;
   }
@@ -2252,7 +2252,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
          }
        }
       }
-      catch (const std::runtime_error& e) {
+      catch (const std::exception& e) {
        cerr << e.what() << std::endl;
        ret = -1;
        goto out;
@@ -2704,13 +2704,13 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       const string & oid = *iter;
       if (use_striper) {
        if (forcefull) {
-         ret = striper.remove(oid, CEPH_OSD_FLAG_FULL_FORCE);
+         ret = striper.remove(oid, (CEPH_OSD_FLAG_FULL_FORCE | CEPH_OSD_FLAG_FULL_TRY));
        } else {
          ret = striper.remove(oid);
        }
       } else {
        if (forcefull) {
-         ret = io_ctx.remove(oid, CEPH_OSD_FLAG_FULL_FORCE);
+         ret = io_ctx.remove(oid, (CEPH_OSD_FLAG_FULL_FORCE | CEPH_OSD_FLAG_FULL_TRY));
        } else {
          ret = io_ctx.remove(oid);
        }
index 9860089a303b6bdb6f50405f4add6692607d5630..11bc08bf48c74133a22a2f0a9e92be3eb0f58e40 100644 (file)
@@ -1503,9 +1503,8 @@ void ImageReplayer<I>::reschedule_update_status_task(int new_interval) {
       m_update_status_interval = new_interval;
     }
 
-    bool restarting = (new_interval == 0 || canceled_task);
     if (new_interval >= 0 && is_running_() &&
-        start_mirror_image_status_update(false, restarting)) {
+        start_mirror_image_status_update(true, false)) {
       m_update_status_task = new FunctionContext(
         [this](int r) {
           assert(m_threads->timer_lock.is_locked());
@@ -1520,6 +1519,7 @@ void ImageReplayer<I>::reschedule_update_status_task(int new_interval) {
 
   if (canceled_task) {
     dout(20) << "canceled task" << dendl;
+    // decrement in-flight status update counter for canceled task
     finish_mirror_image_status_update();
   }
 }