From 05a536ef04248702f72713fd2fe81cb055624784 Mon Sep 17 00:00:00 2001 From: Thomas Lamprecht Date: Fri, 25 Aug 2023 14:05:24 +0200 Subject: [PATCH] update ceph source to reef 18.2.0 Signed-off-by: Thomas Lamprecht --- ceph/CMakeLists.txt | 2 +- ceph/PendingReleaseNotes | 11 +- ceph/README.md | 51 +- ceph/ceph.spec | 6 +- ceph/changelog.upstream | 16 +- ceph/debian/cephfs-mirror.install | 2 + ceph/doc/cephfs/createfs.rst | 4 + ceph/doc/cephfs/mds-config-ref.rst | 2 + ceph/doc/cephfs/troubleshooting.rst | 127 +++++ ceph/doc/dev/encoding.rst | 137 ++++- ceph/doc/dev/health-reports.rst | 59 -- ceph/doc/dev/perf_counters.rst | 51 +- ceph/doc/dev/release-checklists.rst | 2 +- ceph/doc/foundation.rst | 9 +- ceph/doc/governance.rst | 4 +- ceph/doc/mgr/ceph_api/index.rst | 22 +- ceph/doc/mgr/nfs.rst | 2 +- ceph/doc/mgr/rgw.rst | 4 +- ceph/doc/rados/configuration/ceph-conf.rst | 533 +++++++++--------- .../rados/configuration/mon-lookup-dns.rst | 2 + ceph/doc/rados/operations/add-or-rm-osds.rst | 373 ++++++------ ceph/doc/rados/operations/cache-tiering.rst | 3 +- ceph/doc/rados/operations/crush-map-edits.rst | 397 +++++++------ ceph/doc/rados/operations/crush-map.rst | 2 + ceph/doc/rados/operations/erasure-code.rst | 2 +- .../troubleshooting/troubleshooting-mon.rst | 5 +- ceph/doc/radosgw/admin.rst | 10 +- ceph/doc/radosgw/compression.rst | 4 + ceph/doc/radosgw/lua-scripting.rst | 9 +- ceph/doc/radosgw/multisite.rst | 52 +- ceph/doc/radosgw/s3select.rst | 75 +-- ceph/install-deps.sh | 5 +- ceph/qa/distros/all/centos_9.stream.yaml | 2 + ceph/qa/distros/all/centos_latest.yaml | 1 + .../centos_latest.yaml | 1 + .../qa/distros/supported/centos_8.stream.yaml | 1 + ceph/qa/distros/supported/centos_latest.yaml | 2 +- .../basic/tasks/rados_python.yaml | 6 +- .../rbd/tasks/rbd_python_api_tests.yaml | 6 +- .../rbd_python_api_tests_old_format.yaml | 6 +- .../fs/libcephfs/tasks/libcephfs_python.yaml | 7 + .../multiclient/tasks/cephfs_misc_tests.yaml | 1 + .../old_client/centos_8.yaml | 
1 + .../old_client/centos_latest.yaml | 1 - .../upgraded_client/centos_8.yaml | 1 + .../upgraded_client/centos_latest.yaml | 1 - ceph/qa/suites/fs/upgrade/nofs/centos_8.yaml | 1 + .../suites/fs/upgrade/nofs/centos_latest.yaml | 1 - .../fs/upgrade/upgraded_client/centos_8.yaml | 1 + .../upgraded_client/centos_latest.yaml | 1 - .../cephadm/upgrade/3-upgrade/simple.yaml | 2 + .../rados/basic/tasks/rados_cls_all.yaml | 1 + .../rados/basic/tasks/rados_python.yaml | 7 + .../all/test_envlibrados_for_rocksdb/% | 0 .../all/test_envlibrados_for_rocksdb/.qa | 1 - .../supported/centos_latest.yaml | 1 - .../supported/rhel_latest.yaml | 1 - .../test_envlibrados_for_rocksdb.yaml | 22 - .../rbd_python_api_tests_old_format.yaml | 4 + .../librbd/workloads/python_api_tests.yaml | 5 + .../python_api_tests_with_defaults.yaml | 5 + .../python_api_tests_with_journaling.yaml | 5 + .../valgrind/workloads/python_api_tests.yaml | 5 + .../python_api_tests_with_defaults.yaml | 5 + .../python_api_tests_with_journaling.yaml | 5 + .../suites/rgw/tempest/tasks/rgw_tempest.yaml | 9 +- .../pacific/distro$/centos_8.stream.yaml | 1 + .../pacific/distro$/centos_latest.yaml | 1 - .../suites/rgw/verify/tasks/versioning.yaml | 5 + .../smoke/basic/tasks/test/rados_python.yaml | 5 + .../tasks/test/rbd_python_api_tests.yaml | 5 + .../parallel/workload/test_rbd_python.yaml | 5 + .../parallel/workload/test_rbd_python.yaml | 5 + ceph/qa/tasks/ceph_test_case.py | 18 +- ceph/qa/tasks/cephfs/filesystem.py | 3 + ceph/qa/tasks/cephfs/test_data_scan.py | 4 + ceph/qa/tasks/cephfs/test_failover.py | 23 +- ceph/qa/tasks/cephfs/test_misc.py | 43 ++ ceph/qa/tasks/cephfs/test_snapshots.py | 10 + ceph/qa/tasks/keystone.py | 134 +++-- ceph/qa/tasks/tempest.py | 2 +- ceph/qa/valgrind.supp | 33 +- .../workunits/fs/damage/test-first-damage.sh | 14 +- ceph/qa/workunits/fs/test_python.sh | 2 +- ceph/qa/workunits/rados/test_python.sh | 2 +- ceph/qa/workunits/rbd/qemu-iotests.sh | 20 +- ceph/qa/workunits/rbd/test_librbd_python.sh | 
4 +- ceph/qa/workunits/rgw/common.py | 57 ++ ceph/qa/workunits/rgw/run-versioning.sh | 19 + ceph/qa/workunits/rgw/test_rgw_reshard.py | 70 +-- ceph/qa/workunits/rgw/test_rgw_versioning.py | 110 ++++ ceph/src/.git_version | 4 +- ceph/src/ceph-volume/ceph_volume/api/lvm.py | 1 - .../ceph_volume/devices/lvm/activate.py | 103 +--- .../ceph_volume/devices/lvm/batch.py | 58 +- .../ceph_volume/devices/lvm/common.py | 27 +- .../ceph_volume/devices/lvm/create.py | 4 +- .../ceph_volume/devices/lvm/prepare.py | 112 +--- .../ceph_volume/devices/lvm/zap.py | 3 +- .../ceph_volume/devices/raw/prepare.py | 2 +- .../ceph_volume/devices/simple/activate.py | 32 +- .../ceph_volume/tests/api/test_lvm.py | 14 +- .../tests/devices/lvm/test_activate.py | 179 ------ .../tests/devices/lvm/test_batch.py | 9 +- .../tests/devices/lvm/test_create.py | 34 -- .../tests/devices/lvm/test_prepare.py | 52 +- .../ceph_volume/tests/devices/lvm/test_zap.py | 2 +- .../tests/devices/simple/test_activate.py | 36 -- .../tests/devices/simple/test_scan.py | 2 +- .../tests/functional/batch/tox.ini | 8 +- .../lvm/centos8/filestore/create/Vagrantfile | 1 - .../centos8/filestore/create/group_vars/all | 1 - .../lvm/centos8/filestore/create/hosts | 8 - .../lvm/centos8/filestore/create/setup.yml | 1 - .../lvm/centos8/filestore/create/test.yml | 1 - .../filestore/create/vagrant_variables.yml | 1 - .../lvm/centos8/filestore/dmcrypt/Vagrantfile | 1 - .../centos8/filestore/dmcrypt/group_vars/all | 1 - .../lvm/centos8/filestore/dmcrypt/hosts | 8 - .../lvm/centos8/filestore/dmcrypt/setup.yml | 1 - .../lvm/centos8/filestore/dmcrypt/test.yml | 120 ---- .../filestore/dmcrypt/vagrant_variables.yml | 1 - .../lvm/playbooks/test_filestore.yml | 191 ------- .../ceph_volume/tests/functional/lvm/tox.ini | 5 +- .../tests/functional/simple/tox.ini | 5 +- .../ceph_volume/tests/systemd/test_main.py | 2 +- .../ceph_volume/tests/test_configuration.py | 2 +- .../ceph_volume/tests/test_decorators.py | 2 +- 
.../ceph_volume/tests/test_terminal.py | 2 +- .../tests/util/test_arg_validators.py | 18 +- .../ceph_volume/tests/util/test_device.py | 2 +- .../ceph_volume/tests/util/test_prepare.py | 108 ---- .../ceph_volume/tests/util/test_system.py | 2 +- .../ceph_volume/util/arg_validators.py | 2 +- .../ceph-volume/ceph_volume/util/device.py | 11 +- .../ceph_volume/util/encryption.py | 7 +- .../ceph-volume/ceph_volume/util/prepare.py | 72 --- ceph/src/ceph_release | 2 +- ceph/src/cephadm/cephadm.py | 10 +- ceph/src/client/Client.cc | 20 +- ceph/src/client/MetaRequest.h | 2 +- ceph/src/common/fault_injector.h | 33 +- ceph/src/common/options/mds.yaml.in | 20 + ceph/src/common/options/rgw.yaml.in | 20 + ceph/src/common/perf_counters.cc | 35 +- ceph/src/exporter/DaemonMetricCollector.cc | 82 ++- ceph/src/mds/CDentry.cc | 2 + ceph/src/mds/CDentry.h | 6 +- ceph/src/mds/CDir.cc | 1 + ceph/src/mds/CInode.h | 1 + ceph/src/mds/MDCache.cc | 7 + ceph/src/mds/MDCache.h | 15 + ceph/src/mds/MDSRank.cc | 2 + ceph/src/mds/MDSRank.h | 25 + ceph/src/mds/Server.cc | 140 ++++- ceph/src/mds/Server.h | 4 + ceph/src/mds/StrayManager.cc | 23 +- ceph/src/mds/events/EMetaBlob.h | 2 +- ceph/src/mds/journal.cc | 31 +- ceph/src/mds/mdstypes.cc | 13 +- ceph/src/messages/MClientReply.h | 6 +- ceph/src/mon/MDSMonitor.cc | 28 +- ceph/src/mon/MonClient.cc | 5 + ceph/src/mon/OSDMonitor.cc | 6 + ceph/src/msg/async/AsyncMessenger.cc | 38 +- ceph/src/os/bluestore/BlueStore.cc | 1 + ceph/src/pybind/cephfs/cephfs.pyx | 9 +- ceph/src/pybind/mgr/cephadm/migrations.py | 5 + ceph/src/pybind/mgr/cephadm/module.py | 2 +- ceph/src/pybind/mgr/cephadm/tests/fixtures.py | 4 +- .../mgr/cephadm/tests/test_migration.py | 90 +-- .../pybind/mgr/cephadm/tests/test_upgrade.py | 6 +- ceph/src/pybind/mgr/crash/module.py | 2 +- .../mgr/dashboard/ci/cephadm/start-cluster.sh | 14 - .../dashboard/frontend/dist/en-US/index.html | 2 +- ...aeea20ed40.js => main.8be028f171baab96.js} | 2 +- .../dashboard-area-chart.component.ts | 153 ++--- 
.../dashboard/dashboard-v3.component.html | 8 +- .../pipes/dimless-binary-per-second.pipe.ts | 16 +- .../shared/services/formatter.service.spec.ts | 22 + .../app/shared/services/formatter.service.ts | 43 ++ .../services/number-formatter.service.spec.ts | 16 + .../services/number-formatter.service.ts | 50 ++ .../rbd_support/mirror_snapshot_schedule.py | 5 +- ceph/src/pybind/mgr/rbd_support/module.py | 3 +- ceph/src/pybind/rados/rados.pyx | 39 +- ceph/src/pybind/rbd/rbd.pyx | 12 +- ceph/src/rgw/driver/dbstore/common/dbstore.cc | 15 +- ceph/src/rgw/driver/json_config/store.cc | 3 +- .../src/rgw/driver/rados/rgw_d3n_datacache.cc | 16 +- ceph/src/rgw/driver/rados/rgw_d3n_datacache.h | 2 +- ceph/src/rgw/driver/rados/rgw_rados.cc | 224 ++++++-- ceph/src/rgw/driver/rados/rgw_rados.h | 23 +- ceph/src/rgw/driver/rados/rgw_sal_rados.cc | 2 +- ceph/src/rgw/driver/rados/rgw_sal_rados.h | 3 + ceph/src/rgw/driver/rados/rgw_tools.cc | 15 +- ceph/src/rgw/driver/rados/rgw_tools.h | 5 +- ceph/src/rgw/driver/rados/rgw_zone.cc | 10 +- ceph/src/rgw/rgw_admin.cc | 14 +- ceph/src/rgw/rgw_aio.cc | 10 +- ceph/src/rgw/rgw_d3n_cacherequest.h | 18 +- ceph/src/rgw/rgw_op.cc | 27 +- ceph/src/rgw/rgw_sal.h | 2 + ceph/src/rgw/rgw_sal_daos.h | 3 + ceph/src/rgw/rgw_sal_dbstore.h | 3 + ceph/src/rgw/rgw_sal_filter.h | 3 + ceph/src/rgw/rgw_sal_motr.h | 3 + ceph/src/rgw/rgw_zone_features.h | 7 + ceph/src/test/perf_counters.cc | 475 +++++++++------- ceph/src/test/pybind/assertions.py | 26 + ceph/src/test/pybind/pytest.ini | 9 + ceph/src/test/pybind/test_cephfs.py | 164 ++---- ceph/src/test/pybind/test_rados.py | 113 ++-- ceph/src/test/pybind/test_rbd.py | 86 +-- ceph/src/test/pybind/test_rgwfs.py | 31 +- ceph/src/tools/cephfs/first-damage.py | 6 +- 216 files changed, 3346 insertions(+), 3010 deletions(-) create mode 100644 ceph/qa/distros/all/centos_9.stream.yaml create mode 120000 ceph/qa/distros/all/centos_latest.yaml create mode 120000 ceph/qa/distros/supported-random-distro$/centos_latest.yaml create 
mode 120000 ceph/qa/distros/supported/centos_8.stream.yaml create mode 120000 ceph/qa/suites/fs/upgrade/featureful_client/old_client/centos_8.yaml delete mode 120000 ceph/qa/suites/fs/upgrade/featureful_client/old_client/centos_latest.yaml create mode 120000 ceph/qa/suites/fs/upgrade/featureful_client/upgraded_client/centos_8.yaml delete mode 120000 ceph/qa/suites/fs/upgrade/featureful_client/upgraded_client/centos_latest.yaml create mode 120000 ceph/qa/suites/fs/upgrade/nofs/centos_8.yaml delete mode 120000 ceph/qa/suites/fs/upgrade/nofs/centos_latest.yaml create mode 120000 ceph/qa/suites/fs/upgrade/upgraded_client/centos_8.yaml delete mode 120000 ceph/qa/suites/fs/upgrade/upgraded_client/centos_latest.yaml delete mode 100644 ceph/qa/suites/rados/singleton/all/test_envlibrados_for_rocksdb/% delete mode 120000 ceph/qa/suites/rados/singleton/all/test_envlibrados_for_rocksdb/.qa delete mode 120000 ceph/qa/suites/rados/singleton/all/test_envlibrados_for_rocksdb/supported/centos_latest.yaml delete mode 120000 ceph/qa/suites/rados/singleton/all/test_envlibrados_for_rocksdb/supported/rhel_latest.yaml delete mode 100644 ceph/qa/suites/rados/singleton/all/test_envlibrados_for_rocksdb/test_envlibrados_for_rocksdb.yaml create mode 120000 ceph/qa/suites/rgw/upgrade/1-install/pacific/distro$/centos_8.stream.yaml delete mode 120000 ceph/qa/suites/rgw/upgrade/1-install/pacific/distro$/centos_latest.yaml create mode 100644 ceph/qa/suites/rgw/verify/tasks/versioning.yaml create mode 100755 ceph/qa/workunits/rgw/common.py create mode 100755 ceph/qa/workunits/rgw/run-versioning.sh create mode 100755 ceph/qa/workunits/rgw/test_rgw_versioning.py delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/create/Vagrantfile delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/create/group_vars/all delete mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/create/hosts delete mode 
120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/create/setup.yml delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/create/test.yml delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/create/vagrant_variables.yml delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/dmcrypt/Vagrantfile delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/dmcrypt/group_vars/all delete mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/dmcrypt/hosts delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/dmcrypt/setup.yml delete mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/dmcrypt/test.yml delete mode 120000 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/centos8/filestore/dmcrypt/vagrant_variables.yml delete mode 100644 ceph/src/ceph-volume/ceph_volume/tests/functional/lvm/playbooks/test_filestore.yml rename ceph/src/pybind/mgr/dashboard/frontend/dist/en-US/{main.040e98aeea20ed40.js => main.8be028f171baab96.js} (53%) create mode 100644 ceph/src/pybind/mgr/dashboard/frontend/src/app/shared/services/number-formatter.service.spec.ts create mode 100644 ceph/src/pybind/mgr/dashboard/frontend/src/app/shared/services/number-formatter.service.ts create mode 100644 ceph/src/test/pybind/assertions.py create mode 100644 ceph/src/test/pybind/pytest.ini diff --git a/ceph/CMakeLists.txt b/ceph/CMakeLists.txt index 1bf0ce095..91f4923c5 100644 --- a/ceph/CMakeLists.txt +++ b/ceph/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.16) project(ceph - VERSION 18.1.2 + VERSION 18.2.0 LANGUAGES CXX C ASM) cmake_policy(SET CMP0028 NEW) diff --git a/ceph/PendingReleaseNotes b/ceph/PendingReleaseNotes index 02f1c818f..5113444f9 100644 --- a/ceph/PendingReleaseNotes +++ b/ceph/PendingReleaseNotes @@ 
-77,7 +77,11 @@ map and unmap images in namespaces using the `image-spec` syntax since then but the corresponding option available in most other commands was missing. * RGW: Compression is now supported for objects uploaded with Server-Side Encryption. - When both are enabled, compression is applied before encryption. + When both are enabled, compression is applied before encryption. Earlier releases + of multisite do not replicate such objects correctly, so all zones must upgrade to + Reef before enabling the `compress-encrypted` zonegroup feature: see + https://docs.ceph.com/en/reef/radosgw/multisite/#zone-features and note the + security considerations. * RGW: the "pubsub" functionality for storing bucket notifications inside Ceph is removed. Together with it, the "pubsub" zone should not be used anymore. The REST operations, as well as radosgw-admin commands for manipulating @@ -124,6 +128,9 @@ * RBD: list-watchers C++ API (`Image::list_watchers`) now clears the passed `std::list` before potentially appending to it, aligning with the semantics of the corresponding C API (`rbd_watchers_list`). +* The rados python binding is now able to process (opt-in) omap keys as bytes + objects. This enables interacting with RADOS omap keys that are not decodeable as + UTF-8 strings. * Telemetry: Users who are opted-in to telemetry can also opt-in to participating in a leaderboard in the telemetry public dashboards (https://telemetry-public.ceph.com/). Users can now also add a @@ -162,6 +169,8 @@ fixes and enhancements. * For more detailed information see: https://docs.ceph.com/en/reef/rados/configuration/mclock-config-ref/ +* CEPHFS: After recovering a Ceph File System post following the disaster recovery + procedure, the recovered files under `lost+found` directory can now be deleted. 
>=17.2.1 diff --git a/ceph/README.md b/ceph/README.md index c39f98707..1f00c5dd1 100644 --- a/ceph/README.md +++ b/ceph/README.md @@ -23,33 +23,49 @@ contributed under the terms of the applicable license. ## Checking out the source -You can clone from github with +Clone the ceph/ceph repository from github by running the following command on +a system that has git installed: git clone git@github.com:ceph/ceph -or, if you are not a github user, +Alternatively, if you are not a github user, you should run the following +command on a system that has git installed: git clone https://github.com/ceph/ceph.git -Ceph contains many git submodules that need to be checked out with +When the ceph/ceph repository has been cloned to your system, run the following +command to check out the git submodules associated with the ceph/ceph +repository: git submodule update --init --recursive ## Build Prerequisites -The list of Debian or RPM packages dependencies can be installed with: +*section last updated 27 Jul 2023* + +Make sure that ``curl`` is installed. The Debian and Ubuntu ``apt`` command is +provided here, but if you use a system with a different package manager, then +you must use whatever command is the proper counterpart of this one: + + apt install curl + +Install Debian or RPM package dependencies by running the following command: ./install-deps.sh +Install the ``python3-routes`` package: + + apt install python3-routes + ## Building Ceph -Note that these instructions are meant for developers who are -compiling the code for development and testing. To build binaries -suitable for installation we recommend you build deb or rpm packages -or refer to the `ceph.spec.in` or `debian/rules` to see which -configuration options are specified for production builds. +These instructions are meant for developers who are compiling the code for +development and testing. 
To build binaries that are suitable for installation +we recommend that you build .deb or .rpm packages, or refer to ``ceph.spec.in`` +or ``debian/rules`` to see which configuration options are specified for +production builds. Build instructions: @@ -57,21 +73,20 @@ Build instructions: cd build ninja -(do_cmake.sh now defaults to creating a debug build of ceph that can -be up to 5x slower with some workloads. Please pass -"-DCMAKE_BUILD_TYPE=RelWithDebInfo" to do_cmake.sh to create a non-debug -release. +``do_cmake.sh`` defaults to creating a debug build of Ceph that can be up to 5x +slower with some workloads. Pass ``-DCMAKE_BUILD_TYPE=RelWithDebInfo`` to +``do_cmake.sh`` to create a non-debug release. The number of jobs used by `ninja` is derived from the number of CPU cores of the building host if unspecified. Use the `-j` option to limit the job number if the build jobs are running out of memory. On average, each job takes around -2.5GiB memory.) +2.5GiB memory. -This assumes you make your build dir a subdirectory of the ceph.git +This assumes that you make your build directory a subdirectory of the ceph.git checkout. If you put it elsewhere, just point `CEPH_GIT_DIR` to the correct -path to the checkout. Any additional CMake args can be specified by setting ARGS -before invoking do_cmake. See [cmake options](#cmake-options) -for more details. Eg. +path to the checkout. Additional CMake args can be specified by setting ARGS +before invoking ``do_cmake.sh``. See [cmake options](#cmake-options) +for more details. 
For example: ARGS="-DCMAKE_C_COMPILER=gcc-7" ./do_cmake.sh diff --git a/ceph/ceph.spec b/ceph/ceph.spec index 49a297f9e..0d559f923 100644 --- a/ceph/ceph.spec +++ b/ceph/ceph.spec @@ -170,7 +170,7 @@ # main package definition ################################################################################# Name: ceph -Version: 18.1.2 +Version: 18.2.0 Release: 0%{?dist} %if 0%{?fedora} || 0%{?rhel} Epoch: 2 @@ -186,7 +186,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD- Group: System/Filesystems %endif URL: http://ceph.com/ -Source0: %{?_remote_tarball_prefix}ceph-18.1.2.tar.bz2 +Source0: %{?_remote_tarball_prefix}ceph-18.2.0.tar.bz2 %if 0%{?suse_version} # _insert_obs_source_lines_here ExclusiveArch: x86_64 aarch64 ppc64le s390x @@ -1292,7 +1292,7 @@ This package provides a Ceph MIB for SNMP traps. # common ################################################################################# %prep -%autosetup -p1 -n ceph-18.1.2 +%autosetup -p1 -n ceph-18.2.0 %build # Disable lto on systems that do not support symver attribute diff --git a/ceph/changelog.upstream b/ceph/changelog.upstream index 584621aef..f76511bb4 100644 --- a/ceph/changelog.upstream +++ b/ceph/changelog.upstream @@ -1,7 +1,19 @@ -ceph (18.1.2-1jammy) jammy; urgency=medium +ceph (18.2.0-1jammy) jammy; urgency=medium - -- Jenkins Build Slave User Tue, 27 Jun 2023 20:13:15 +0000 + -- Jenkins Build Slave User Thu, 03 Aug 2023 18:57:50 +0000 + +ceph (18.2.0-1) stable; urgency=medium + + * New upstream release + + -- Ceph Release Team Thu, 03 Aug 2023 16:53:10 +0000 + +ceph (18.1.3-1) rc; urgency=medium + + * New upstream release + + -- Ceph Release Team Tue, 25 Jul 2023 02:48:09 +0000 ceph (18.1.2-1) rc; urgency=medium diff --git a/ceph/debian/cephfs-mirror.install b/ceph/debian/cephfs-mirror.install index 19d2e483d..ca0807e27 100644 --- a/ceph/debian/cephfs-mirror.install +++ b/ceph/debian/cephfs-mirror.install @@ -1 +1,3 @@ +lib/systemd/system/cephfs-mirror* 
usr/bin/cephfs-mirror +usr/share/man/man8/cephfs-mirror.8 diff --git a/ceph/doc/cephfs/createfs.rst b/ceph/doc/cephfs/createfs.rst index 59706d1d2..4a282e562 100644 --- a/ceph/doc/cephfs/createfs.rst +++ b/ceph/doc/cephfs/createfs.rst @@ -15,6 +15,10 @@ There are important considerations when planning these pools: - We recommend the fastest feasible low-latency storage devices (NVMe, Optane, or at the very least SAS/SATA SSD) for the metadata pool, as this will directly affect the latency of client file system operations. +- We strongly suggest that the CephFS metadata pool be provisioned on dedicated + SSD / NVMe OSDs. This ensures that high client workload does not adversely + impact metadata operations. See :ref:`device_classes` to configure pools this + way. - The data pool used to create the file system is the "default" data pool and the location for storing all inode backtrace information, which is used for hard link management and disaster recovery. For this reason, all CephFS inodes diff --git a/ceph/doc/cephfs/mds-config-ref.rst b/ceph/doc/cephfs/mds-config-ref.rst index a5e0bba91..5b68053a0 100644 --- a/ceph/doc/cephfs/mds-config-ref.rst +++ b/ceph/doc/cephfs/mds-config-ref.rst @@ -57,6 +57,8 @@ .. confval:: mds_kill_import_at .. confval:: mds_kill_link_at .. confval:: mds_kill_rename_at +.. confval:: mds_inject_skip_replaying_inotable +.. confval:: mds_kill_skip_replaying_inotable .. confval:: mds_wipe_sessions .. confval:: mds_wipe_ino_prealloc .. confval:: mds_skip_ino diff --git a/ceph/doc/cephfs/troubleshooting.rst b/ceph/doc/cephfs/troubleshooting.rst index f4cd5f6c9..b58d2469f 100644 --- a/ceph/doc/cephfs/troubleshooting.rst +++ b/ceph/doc/cephfs/troubleshooting.rst @@ -21,6 +21,133 @@ We can get hints about what's going on by dumping the MDS cache :: If high logging levels are set on the MDS, that will almost certainly hold the information we need to diagnose and solve the issue. 
+Stuck during recovery +===================== + +Stuck in up:replay +------------------ + +If your MDS is stuck in ``up:replay`` then it is likely that the journal is +very long. Did you see ``MDS_HEALTH_TRIM`` cluster warnings saying the MDS is +behind on trimming its journal? If the journal has grown very large, it can +take hours to read the journal. There is no working around this but there +are things you can do to speed things along: + +Reduce MDS debugging to 0. Even at the default settings, the MDS logs some +messages to memory for dumping if a fatal error is encountered. You can avoid +this: + +.. code:: bash + + ceph config set mds debug_mds 0 + ceph config set mds debug_ms 0 + ceph config set mds debug_monc 0 + +Note if the MDS fails then there will be virtually no information to determine +why. If you can calculate when ``up:replay`` will complete, you should restore +these configs just prior to entering the next state: + +.. code:: bash + + ceph config rm mds debug_mds + ceph config rm mds debug_ms + ceph config rm mds debug_monc + +Once you've got replay moving along faster, you can calculate when the MDS will +complete. This is done by examining the journal replay status: + +.. code:: bash + + $ ceph tell mds.:0 status | jq .replay_status + { + "journal_read_pos": 4195244, + "journal_write_pos": 4195244, + "journal_expire_pos": 4194304, + "num_events": 2, + "num_segments": 2 + } + +Replay completes when the ``journal_read_pos`` reaches the +``journal_write_pos``. The write position will not change during replay. Track +the progression of the read position to compute the expected time to complete. + + +Avoiding recovery roadblocks +---------------------------- + +When trying to urgently restore your file system during an outage, here are some +things to do: + +* **Deny all reconnect to clients.** This effectively blocklists all existing + CephFS sessions so all mounts will hang or become unavailable. + +.. 
code:: bash + + ceph config set mds mds_deny_all_reconnect true + + Remember to undo this after the MDS becomes active. + +.. note:: This does not prevent new sessions from connecting. For that, see the ``refuse_client_session`` file system setting. + +* **Extend the MDS heartbeat grace period**. This avoids replacing an MDS that appears + "stuck" doing some operation. Sometimes recovery of an MDS may involve an + operation that may take longer than expected (from the programmer's + perspective). This is more likely when recovery is already taking a longer than + normal amount of time to complete (indicated by your reading this document). + Avoid unnecessary replacement loops by extending the heartbeat graceperiod: + +.. code:: bash + + ceph config set mds mds_heartbeat_reset_grace 3600 + + This has the effect of having the MDS continue to send beacons to the monitors + even when its internal "heartbeat" mechanism has not been reset (beat) in one + hour. Note the previous mechanism for achieving this was via the + `mds_beacon_grace` monitor setting. + +* **Disable open file table prefetch.** Normally, the MDS will prefetch + directory contents during recovery to heat up its cache. During long + recovery, the cache is probably already hot **and large**. So this behavior + can be undesirable. Disable using: + +.. code:: bash + + ceph config set mds mds_oft_prefetch_dirfrags false + +* **Turn off clients.** Clients reconnecting to the newly ``up:active`` MDS may + cause new load on the file system when it's just getting back on its feet. + There will likely be some general maintenance to do before workloads should be + resumed. For example, expediting journal trim may be advisable if the recovery + took a long time because replay was reading a overly large journal. + + You can do this manually or use the new file system tunable: + +.. code:: bash + + ceph fs set refuse_client_session true + + That prevents any clients from establishing new sessions with the MDS. 
+ + + +Expediting MDS journal trim +=========================== + +If your MDS journal grew too large (maybe your MDS was stuck in up:replay for a +long time!), you will want to have the MDS trim its journal more frequently. +You will know the journal is too large because of ``MDS_HEALTH_TRIM`` warnings. + +The main tunable available to do this is to modify the MDS tick interval. The +"tick" interval drives several upkeep activities in the MDS. It is strongly +recommended no significant file system load be present when modifying this tick +interval. This setting only affects an MDS in ``up:active``. The MDS does not +trim its journal during recovery. + +.. code:: bash + + ceph config set mds mds_tick_interval 2 + + RADOS Health ============ diff --git a/ceph/doc/dev/encoding.rst b/ceph/doc/dev/encoding.rst index 013046f33..8ec3bb22d 100644 --- a/ceph/doc/dev/encoding.rst +++ b/ceph/doc/dev/encoding.rst @@ -3,9 +3,74 @@ Serialization (encode/decode) ============================= When a structure is sent over the network or written to disk, it is -encoded into a string of bytes. Serializable structures have -``encode`` and ``decode`` methods that write and read from ``bufferlist`` -objects representing byte strings. +encoded into a string of bytes. Usually (but not always -- multiple +serialization facilities coexist in Ceph) serializable structures +have ``encode`` and ``decode`` methods that write and read from +``bufferlist`` objects representing byte strings. + +Terminology +----------- +It is best to think not in the domain of daemons and clients but +encoders and decoders. An encoder serializes a structure into a bufferlist +while a decoder does the opposite. + +Encoders and decoders can be referred collectively as dencoders. + +Dencoders (both encoders and docoders) live within daemons and clients. 
+For instance, when an RBD client issues an IO operation, it prepares +an instance of the ``MOSDOp`` structure and encodes it into a bufferlist +that is put on the wire. +An OSD reads these bytes and decodes them back into an ``MOSDOp`` instance. +Here encoder was used by the client while decoder by the OSD. However, +these roles can swing -- just imagine handling of the response: OSD encodes +the ``MOSDOpReply`` while RBD clients decode. + +Encoder and decoder operate accordingly to a format which is defined +by a programmer by implementing the ``encode`` and ``decode`` methods. + +Principles for format change +---------------------------- +It is not unusual that the format of serialization changes. This +process requires careful attention from during both development +and review. + +The general rule is that a decoder must understand what had been +encoded by an encoder. Most of the problems come from ensuring +that compatibility continues between old decoders and new encoders +as well as new decoders and old decoders. One should assume +that -- if not otherwise derogated -- any mix (old/new) is +possible in a cluster. There are 2 main reasons for that: + +1. Upgrades. Although there are recommendations related to the order + of entity types (mons/osds/clients), it is not mandatory and + no assumption should be made about it. +2. Huge variability of client versions. It was always the case + that kernel (and thus kernel clients) upgrades are decoupled + from Ceph upgrades. Moreover, proliferation of containerization + bring the variability even to e.g. ``librbd`` -- now user space + libraries live on the container own. + +With this being said, there are few rules limiting the degree +of interoperability between dencoders: + +* ``n-2`` for dencoding between daemons, +* ``n-3`` hard requirement for client-involved scenarios, +* ``n-3..`` soft requirements for clinet-involved scenarios. Ideally + every client should be able to talk any version of daemons. 
+ +As the underlying reasons are the same, the rules dencoders +follow are virtually the same as for deprecations of our features +bits. See the ``Notes on deprecation`` in ``src/include/ceph_features.h``. + +Frameworks +---------- +Currently multiple genres of dencoding helpers co-exist. + +* encoding.h (the most proliferated one), +* denc.h (performance optimized, seen mostly in ``BlueStore``), +* the `Message` hierarchy. + +Although details vary, the interoperability rules stay the same. Adding a field to a structure ----------------------------- @@ -93,3 +158,69 @@ because we might still be passed older-versioned messages that do not have the field. The ``struct_v`` variable is a local set by the ``DECODE_START`` macro. +# Into the weeeds + +The append-extendability of our dencoders is a result of the forward +compatibility that the ``ENCODE_START`` and ``DECODE_FINISH`` macros bring. + +They are implementing extendibility facilities. An encoder, when filling +the bufferlist, prepends three fields: version of the current format, +minimal version of a decoder compatible with it and the total size of +all encoded fields. + +.. code-block:: cpp + + /** + * start encoding block + * + * @param v current (code) version of the encoding + * @param compat oldest code version that can decode it + * @param bl bufferlist to encode to + * + */ + #define ENCODE_START(v, compat, bl) \ + __u8 struct_v = v; \ + __u8 struct_compat = compat; \ + ceph_le32 struct_len; \ + auto filler = (bl).append_hole(sizeof(struct_v) + \ + sizeof(struct_compat) + sizeof(struct_len)); \ + const auto starting_bl_len = (bl).length(); \ + using ::ceph::encode; \ + do { + +The ``struct_len`` field allows the decoder to eat all the bytes that were +left undecoded in the user-provided ``decode`` implementation. +Analogically, decoders tracks how much input has been decoded in the +user-provided ``decode`` methods. + +.. 
code-block:: cpp
+
+  #define DECODE_START(bl) \
+    unsigned struct_end = 0; \
+    __u32 struct_len; \
+    decode(struct_len, bl); \
+    ... \
+    struct_end = bl.get_off() + struct_len; \
+    } \
+    do {
+
+
+The decoder uses this information to discard the extra bytes it does not
+understand. Advancing the bufferlist is critical as dencoders tend to be
+nested; just leaving it intact would work only for the very last ``decode``
+call in a nested structure.
+
+.. code-block:: cpp
+
+  #define DECODE_FINISH(bl) \
+    } while (false); \
+    if (struct_end) { \
+      ... \
+      if (bl.get_off() < struct_end) \
+        bl += struct_end - bl.get_off(); \
+    }
+
+
+This entire, cooperative mechanism allows an encoder (and its further
+revisions) to generate more byte stream (due to e.g. adding a new field at
+the end) and not worry that the residue will crash older decoder revisions.
diff --git a/ceph/doc/dev/health-reports.rst b/ceph/doc/dev/health-reports.rst
index 4a6a7d671..7769c6d8c 100644
--- a/ceph/doc/dev/health-reports.rst
+++ b/ceph/doc/dev/health-reports.rst
@@ -16,32 +16,6 @@ mgr module
 The following diagrams outline the involved parties and how the interact when
 the clients query for the reports:
 
-.. seqdiag::
-
-   seqdiag {
-     default_note_color = lightblue;
-     osd; mon; ceph-cli;
-     osd => mon [ label = "update osdmap service" ];
-     osd => mon [ label = "update osdmap service" ];
-     ceph-cli -> mon [ label = "send 'health' command" ];
-     mon -> mon [ leftnote = "gather checks from services" ];
-     ceph-cli <-- mon [ label = "checks and mutes" ];
-   }
-
-..
seqdiag:: - - seqdiag { - default_note_color = lightblue; - osd; mon; mgr; mgr-module; - mgr -> mon [ label = "subscribe for 'mgrdigest'" ]; - osd => mon [ label = "update osdmap service" ]; - osd => mon [ label = "update osdmap service" ]; - mon -> mgr [ label = "send MMgrDigest" ]; - mgr -> mgr [ note = "update cluster state" ]; - mon <-- mgr; - mgr-module -> mgr [ label = "mgr.get('health')" ]; - mgr-module <-- mgr [ label = "heath reports in json" ]; - } Where are the Reports Generated =============================== @@ -68,19 +42,6 @@ later loaded and decoded, so they can be collected on demand. When it comes to ``MDSMonitor``, it persists the health metrics in the beacon sent by the MDS daemons, and prepares health reports when storing the pending changes. -.. seqdiag:: - - seqdiag { - default_note_color = lightblue; - mds; mon-mds; mon-health; ceph-cli; - mds -> mon-mds [ label = "send beacon" ]; - mon-mds -> mon-mds [ note = "store health metrics in beacon" ]; - mds <-- mon-mds; - mon-mds -> mon-mds [ note = "encode_health(checks)" ]; - ceph-cli -> mon-health [ label = "send 'health' command" ]; - mon-health => mon-mds [ label = "gather health checks" ]; - ceph-cli <-- mon-health [ label = "checks and mutes" ]; - } So, if we want to add a new warning related to cephfs, probably the best place to start is ``MDSMonitor::encode_pending()``, where health reports are collected from @@ -106,23 +67,3 @@ metrics and status to mgr using ``MMgrReport``. On the mgr side, it periodically an aggregated report to the ``MgrStatMonitor`` service on mon. As explained earlier, this service just persists the health reports in the aggregated report to the monstore. -.. 
seqdiag:: - - seqdiag { - default_note_color = lightblue; - service; mgr; mon-mgr-stat; mon-health; - service -> mgr [ label = "send(open)" ]; - mgr -> mgr [ note = "register the new service" ]; - service <-- mgr; - mgr => service [ label = "send(configure)" ]; - service -> mgr [ label = "send(report)" ]; - mgr -> mgr [ note = "update/aggregate service metrics" ]; - service <-- mgr; - service => mgr [ label = "send(report)" ]; - mgr -> mon-mgr-stat [ label = "send(mgr-report)" ]; - mon-mgr-stat -> mon-mgr-stat [ note = "store health checks in the report" ]; - mgr <-- mon-mgr-stat; - mon-health => mon-mgr-stat [ label = "gather health checks" ]; - service => mgr [ label = "send(report)" ]; - service => mgr [ label = "send(close)" ]; - } diff --git a/ceph/doc/dev/perf_counters.rst b/ceph/doc/dev/perf_counters.rst index b71939282..a64d14d33 100644 --- a/ceph/doc/dev/perf_counters.rst +++ b/ceph/doc/dev/perf_counters.rst @@ -208,31 +208,32 @@ A Ceph daemon has the ability to emit a set of perf counter instances with varyi For example, the below counters show the number of put requests for different users on different buckets:: { - "rgw": { - "labels": { - "Bucket: "bkt1", - "User: "user1", - }, - "counters": { - "put": 1, - }, - }, - "rgw": { - "labels": { - }, - "counters": { - "put": 4, - }, - }, - "rgw": { - "labels": { - "Bucket: "bkt1", - "User: "user2", - }, - "counters": { - "put": 3, - }, - } + "rgw": [ + { + "labels": { + "Bucket: "bkt1", + "User: "user1", + }, + "counters": { + "put": 1, + }, + }, + { + "labels": {}, + "counters": { + "put": 4, + }, + }, + { + "labels": { + "Bucket: "bkt1", + "User: "user2", + }, + "counters": { + "put": 3, + }, + }, + ] } All labeled and unlabeled perf counters can be viewed with ``ceph daemon {daemon id} counter dump``. 
diff --git a/ceph/doc/dev/release-checklists.rst b/ceph/doc/dev/release-checklists.rst index f65d0590b..5d296621c 100644 --- a/ceph/doc/dev/release-checklists.rst +++ b/ceph/doc/dev/release-checklists.rst @@ -137,6 +137,6 @@ First release candidate First stable release ==================== -- [ ] src/ceph_release: change type `stable` +- [x] src/ceph_release: change type `stable` - [ ] generate new object corpus for encoding/decoding tests - see :doc:`corpus` - [ ] src/cephadm/cephadm: update `LATEST_STABLE_RELEASE` diff --git a/ceph/doc/foundation.rst b/ceph/doc/foundation.rst index 94130e44d..95a393410 100644 --- a/ceph/doc/foundation.rst +++ b/ceph/doc/foundation.rst @@ -29,21 +29,21 @@ Premier * `Bloomberg `_ * `Clyso `_ -* `DigitalOcean `_ * `IBM `_ * `Intel `_ * `OVH `_ * `Samsung Electronics `_ * `Western Digital `_ -* `XSKY `_ -* `ZTE `_ General ------- +* `42on `_ +* `Akamai `_ * `ARM `_ * `Canonical `_ * `Cloudbase Solutions `_ +* `CloudFerro `_ * `croit `_ * `EasyStack `_ * `ISS `_ @@ -96,7 +96,6 @@ Members ------- * Anjaneya "Reddy" Chagam (Intel) -* Alex Marangone (DigitalOcean) * Carlos Maltzahn (UCSC) - Associate member representative * Dan van der Ster (CERN) - Ceph Council representative * Haomai Wang (XSKY) @@ -111,8 +110,6 @@ Members * Steven Umbehocker (OSNexus) - General member representative * Pawel Sadowski (OVH) * Vincent Hsu (IBM) -* Xie Xingguo (ZTE) -* Zhang Shaowen (China Mobile) Joining ======= diff --git a/ceph/doc/governance.rst b/ceph/doc/governance.rst index a08a32228..493a87666 100644 --- a/ceph/doc/governance.rst +++ b/ceph/doc/governance.rst @@ -80,8 +80,8 @@ Current Members * Adam King * Casey Bodley - * Dan van der Ster - * David Orman + * Dan van der Ster + * David Orman * Ernesto Puerta * Gregory Farnum * Haomai Wang diff --git a/ceph/doc/mgr/ceph_api/index.rst b/ceph/doc/mgr/ceph_api/index.rst index 5785bf130..1cdc9a97b 100644 --- a/ceph/doc/mgr/ceph_api/index.rst +++ b/ceph/doc/mgr/ceph_api/index.rst @@ -41,14 +41,16 @@ So, 
prior to start consuming the Ceph API, a valid JSON Web Token (JWT) has to be obtained, and it may then be reused for subsequent requests. The ``/api/auth`` endpoint will provide the valid token: -.. code-block:: sh +.. prompt:: bash $ - $ curl -X POST "https://example.com:8443/api/auth" \ - -H "Accept: application/vnd.ceph.api.v1.0+json" \ - -H "Content-Type: application/json" \ - -d '{"username": , "password": }' + curl -X POST "https://example.com:8443/api/auth" \ + -H "Accept: application/vnd.ceph.api.v1.0+json" \ + -H "Content-Type: application/json" \ + -d '{"username": , "password": }' - { "token": "", ...} +:: + + { "token": "", ...} The token obtained must be passed together with every API request in the ``Authorization`` HTTP header:: @@ -74,11 +76,11 @@ purpose, Ceph API is built upon the following principles: An example: -.. code-block:: bash +.. prompt:: bash $ - $ curl -X GET "https://example.com:8443/api/osd" \ - -H "Accept: application/vnd.ceph.api.v1.0+json" \ - -H "Authorization: Bearer " + curl -X GET "https://example.com:8443/api/osd" \ + -H "Accept: application/vnd.ceph.api.v1.0+json" \ + -H "Authorization: Bearer " Specification diff --git a/ceph/doc/mgr/nfs.rst b/ceph/doc/mgr/nfs.rst index beec399e9..c25410fcd 100644 --- a/ceph/doc/mgr/nfs.rst +++ b/ceph/doc/mgr/nfs.rst @@ -31,7 +31,7 @@ Create NFS Ganesha Cluster .. code:: bash - $ nfs cluster create [] [--ingress] [--virtual_ip ] [--ingress-mode {default|keepalive-only}] [--port ] + $ ceph nfs cluster create [] [--ingress] [--virtual_ip ] [--ingress-mode {default|keepalive-only|haproxy-standard|haproxy-protocol}] [--port ] This creates a common recovery pool for all NFS Ganesha daemons, new user based on ``cluster_id``, and a common NFS Ganesha config RADOS object. 
diff --git a/ceph/doc/mgr/rgw.rst b/ceph/doc/mgr/rgw.rst index dc280b06d..a3f53280a 100644 --- a/ceph/doc/mgr/rgw.rst +++ b/ceph/doc/mgr/rgw.rst @@ -25,7 +25,7 @@ supports both passing the arguments through the cmd line or as a spec file: .. prompt:: bash # - rgw realm bootstrap [--realm-name] [--zonegroup-name] [--zone-name] [--port] [--placement] [--start-radosgw] + ceph rgw realm bootstrap [--realm-name] [--zonegroup-name] [--zone-name] [--port] [--placement] [--start-radosgw] The command supports providing the configuration through a spec file (`-i option`): @@ -33,7 +33,7 @@ The command supports providing the configuration through a spec file (`-i option ceph rgw realm bootstrap -i myrgw.yaml -Following is an example of RGW mutlisite spec file: +Following is an example of RGW multisite spec file: .. code-block:: yaml diff --git a/ceph/doc/rados/configuration/ceph-conf.rst b/ceph/doc/rados/configuration/ceph-conf.rst index e5d2a37b3..f62a21545 100644 --- a/ceph/doc/rados/configuration/ceph-conf.rst +++ b/ceph/doc/rados/configuration/ceph-conf.rst @@ -4,116 +4,116 @@ Configuring Ceph ================== -When Ceph services start, the initialization process activates a series -of daemons that run in the background. A :term:`Ceph Storage Cluster` runs -at a minimum three types of daemons: +When Ceph services start, the initialization process activates a series of +daemons that run in the background. A :term:`Ceph Storage Cluster` runs at +least three types of daemons: - :term:`Ceph Monitor` (``ceph-mon``) - :term:`Ceph Manager` (``ceph-mgr``) - :term:`Ceph OSD Daemon` (``ceph-osd``) Ceph Storage Clusters that support the :term:`Ceph File System` also run at -least one :term:`Ceph Metadata Server` (``ceph-mds``). Clusters that -support :term:`Ceph Object Storage` run Ceph RADOS Gateway daemons -(``radosgw``) as well. +least one :term:`Ceph Metadata Server` (``ceph-mds``). 
Clusters that support +:term:`Ceph Object Storage` run Ceph RADOS Gateway daemons (``radosgw``). -Each daemon has a number of configuration options, each of which has a -default value. You may adjust the behavior of the system by changing these -configuration options. Be careful to understand the consequences before +Each daemon has a number of configuration options, each of which has a default +value. You may adjust the behavior of the system by changing these +configuration options. Be careful to understand the consequences before overriding default values, as it is possible to significantly degrade the -performance and stability of your cluster. Also note that default values -sometimes change between releases, so it is best to review the version of -this documentation that aligns with your Ceph release. +performance and stability of your cluster. Note too that default values +sometimes change between releases. For this reason, it is best to review the +version of this documentation that applies to your Ceph release. Option names ============ -All Ceph configuration options have a unique name consisting of words -formed with lower-case characters and connected with underscore -(``_``) characters. +Each of the Ceph configuration options has a unique name that consists of words +formed with lowercase characters and connected with underscore characters +(``_``). -When option names are specified on the command line, either underscore -(``_``) or dash (``-``) characters can be used interchangeable (e.g., +When option names are specified on the command line, underscore (``_``) and +dash (``-``) characters can be used interchangeably (for example, ``--mon-host`` is equivalent to ``--mon_host``). -When option names appear in configuration files, spaces can also be -used in place of underscore or dash. 
We suggest, though, that for -clarity and convenience you consistently use underscores, as we do +When option names appear in configuration files, spaces can also be used in +place of underscores or dashes. However, for the sake of clarity and +convenience, we suggest that you consistently use underscores, as we do throughout this documentation. Config sources ============== -Each Ceph daemon, process, and library will pull its configuration -from several sources, listed below. Sources later in the list will -override those earlier in the list when both are present. +Each Ceph daemon, process, and library pulls its configuration from one or more +of the several sources listed below. Sources that occur later in the list +override those that occur earlier in the list (when both are present). - the compiled-in default value - the monitor cluster's centralized configuration database - a configuration file stored on the local host - environment variables -- command line arguments -- runtime overrides set by an administrator +- command-line arguments +- runtime overrides that are set by an administrator One of the first things a Ceph process does on startup is parse the -configuration options provided via the command line, environment, and -local configuration file. The process will then contact the monitor -cluster to retrieve configuration stored centrally for the entire -cluster. Once a complete view of the configuration is available, the -daemon or process startup will proceed. +configuration options provided via the command line, via the environment, and +via the local configuration file. Next, the process contacts the monitor +cluster to retrieve centrally-stored configuration for the entire cluster. +After a complete view of the configuration is available, the startup of the +daemon or process will commence. .. 
_bootstrap-options: Bootstrap options ----------------- -Some configuration options affect the process's ability to contact the -monitors, to authenticate, and to retrieve the cluster-stored configuration. -For this reason, these options might need to be stored locally on the node, and -set by means of a local configuration file. These options include the -following: +Bootstrap options are configuration options that affect the process's ability +to contact the monitors, to authenticate, and to retrieve the cluster-stored +configuration. For this reason, these options might need to be stored locally +on the node, and set by means of a local configuration file. These options +include the following: .. confval:: mon_host .. confval:: mon_host_override - :confval:`mon_dns_srv_name` -- :confval:`mon_data`, :confval:`osd_data`, :confval:`mds_data`, :confval:`mgr_data`, and - similar options that define which local directory the daemon - stores its data in. -- :confval:`keyring`, :confval:`keyfile`, and/or :confval:`key`, which can be used to - specify the authentication credential to use to authenticate with - the monitor. Note that in most cases the default keyring location - is in the data directory specified above. - -In most cases, the default values of these options are suitable. There is one -exception to this: the :confval:`mon_host` option that identifies the addresses -of the cluster's monitors. When DNS is used to identify monitors, a local Ceph +- :confval:`mon_data`, :confval:`osd_data`, :confval:`mds_data`, + :confval:`mgr_data`, and similar options that define which local directory + the daemon stores its data in. +- :confval:`keyring`, :confval:`keyfile`, and/or :confval:`key`, which can be + used to specify the authentication credential to use to authenticate with the + monitor. Note that in most cases the default keyring location is in the data + directory specified above. 
+ +In most cases, there is no reason to modify the default values of these +options. However, there is one exception to this: the :confval:`mon_host` +option that identifies the addresses of the cluster's monitors. But when +:ref:`DNS is used to identify monitors`, a local Ceph configuration file can be avoided entirely. + Skipping monitor config ----------------------- -Pass the option ``--no-mon-config`` to any process to skip the step that -retrieves configuration information from the cluster monitors. This is useful -in cases where configuration is managed entirely via configuration files, or -when the monitor cluster is down and some maintenance activity needs to be -done. - +The option ``--no-mon-config`` can be passed in any command in order to skip +the step that retrieves configuration information from the cluster's monitors. +Skipping this retrieval step can be useful in cases where configuration is +managed entirely via configuration files, or when maintenance activity needs to +be done but the monitor cluster is down. .. _ceph-conf-file: - Configuration sections ====================== -Any given process or daemon has a single value for each configuration -option. However, values for an option may vary across different -daemon types even daemons of the same type. Ceph options that are -stored in the monitor configuration database or in local configuration -files are grouped into sections to indicate which daemons or clients -they apply to. +Each of the configuration options associated with a single process or daemon +has a single value. However, the values for a configuration option can vary +across daemon types, and can vary even across different daemons of the same +type. Ceph options that are stored in the monitor configuration database or in +local configuration files are grouped into sections |---| so-called "configuration +sections" |---| to indicate which daemons or clients they apply to. 
+
-These sections include:
+These sections include the following:
 
 .. confsec:: global
@@ -156,43 +156,42 @@ These sections include:
 
 .. confsec:: client
 
-   Settings under ``client`` affect all Ceph Clients
-   (e.g., mounted Ceph File Systems, mounted Ceph Block Devices,
-   etc.) as well as Rados Gateway (RGW) daemons.
+   Settings under ``client`` affect all Ceph clients
+   (for example, mounted Ceph File Systems, mounted Ceph Block Devices)
+   as well as RADOS Gateway (RGW) daemons.
 
   :example: ``objecter_inflight_ops = 512``
 
-Sections may also specify an individual daemon or client name. For example,
+Configuration sections can also specify an individual daemon or client name. For example,
 ``mon.foo``, ``osd.123``, and ``client.smith`` are all valid section names.
 
-Any given daemon will draw its settings from the global section, the
-daemon or client type section, and the section sharing its name.
-Settings in the most-specific section take precedence, so for example
-if the same option is specified in both :confsec:`global`, :confsec:`mon`, and
-``mon.foo`` on the same source (i.e., in the same configurationfile),
-the ``mon.foo`` value will be used.
+Any given daemon will draw its settings from the global section, the daemon- or
+client-type section, and the section sharing its name. Settings in the
+most-specific section take precedence: for example, if the same
+option is specified in all of :confsec:`global`, :confsec:`mon`, and ``mon.foo``
+on the same source (that is, in the same configuration file), the
+``mon.foo`` setting will be used.
 
 If multiple values of the same configuration option are specified in the same
-section, the last value wins.
-
-Note that values from the local configuration file always take
-precedence over values from the monitor configuration database,
-regardless of which section they appear in.
+section, the last value specified takes precedence.
+Note that values from the local configuration file always take precedence over +values from the monitor configuration database, regardless of the section in +which they appear. .. _ceph-metavariables: Metavariables ============= -Metavariables simplify Ceph Storage Cluster configuration -dramatically. When a metavariable is set in a configuration value, -Ceph expands the metavariable into a concrete value at the time the -configuration value is used. Ceph metavariables are similar to variable expansion in the Bash shell. +Metavariables dramatically simplify Ceph storage cluster configuration. When a +metavariable is set in a configuration value, Ceph expands the metavariable at +the time the configuration value is used. In this way, Ceph metavariables +behave similarly to the way that variable expansion works in the Bash shell. -Ceph supports the following metavariables: +Ceph supports the following metavariables: .. describe:: $cluster @@ -204,7 +203,7 @@ Ceph supports the following metavariables: .. describe:: $type - Expands to a daemon or process type (e.g., ``mds``, ``osd``, or ``mon``) + Expands to a daemon or process type (for example, ``mds``, ``osd``, or ``mon``) :example: ``/var/lib/ceph/$type`` @@ -233,33 +232,32 @@ Ceph supports the following metavariables: :example: ``/var/run/ceph/$cluster-$name-$pid.asok`` - -The Configuration File -====================== +Ceph configuration file +======================= On startup, Ceph processes search for a configuration file in the following locations: -#. ``$CEPH_CONF`` (*i.e.,* the path following the ``$CEPH_CONF`` +#. ``$CEPH_CONF`` (that is, the path following the ``$CEPH_CONF`` environment variable) -#. ``-c path/path`` (*i.e.,* the ``-c`` command line argument) +#. ``-c path/path`` (that is, the ``-c`` command line argument) #. ``/etc/ceph/$cluster.conf`` #. ``~/.ceph/$cluster.conf`` -#. ``./$cluster.conf`` (*i.e.,* in the current working directory) +#. 
``./$cluster.conf`` (that is, in the current working directory)
#. On FreeBSD systems only, ``/usr/local/etc/ceph/$cluster.conf``
 
-where ``$cluster`` is the cluster's name (default ``ceph``).
+Here ``$cluster`` is the cluster's name (default: ``ceph``).
 
-The Ceph configuration file uses an *ini* style syntax. You can add comment
-text after a pound sign (#) or a semi-colon (;). For example:
+The Ceph configuration file uses an ``ini`` style syntax. You can add comment
+text after a pound sign (#) or a semicolon (;). For example:
 
 .. code-block:: ini
 
-	# <--A number (#) sign precedes a comment.
-	; A comment may be anything.
-	# Comments always follow a semi-colon (;) or a pound (#) on each line.
-	# The end of the line terminates a comment.
-	# We recommend that you provide comments in your configuration file(s).
+	# <-- A number sign (#) precedes a comment.
+	; A comment may be anything.
+	# Comments always follow a semicolon (;) or a pound sign (#) on each line.
+	# The end of the line terminates a comment.
+	# We recommend that you provide comments in your configuration file(s).
 
 .. _ceph-conf-settings:
 
@@ -268,40 +266,41 @@ Config file section names
 -------------------------
 
 The configuration file is divided into sections. Each section must begin with a
-valid configuration section name (see `Configuration sections`_, above)
-surrounded by square brackets. For example,
+valid configuration section name (see `Configuration sections`_, above) that is
+surrounded by square brackets. For example:
 
 .. code-block:: ini
 
-	[global]
-	debug_ms = 0
-
-	[osd]
-	debug_ms = 1
+	[global]
+	debug_ms = 0
 
-	[osd.1]
-	debug_ms = 10
+	[osd]
+	debug_ms = 1
 
-	[osd.2]
-	debug_ms = 10
+	[osd.1]
+	debug_ms = 10
 
+	[osd.2]
+	debug_ms = 10
 
 Config file option values
 -------------------------
 
-The value of a configuration option is a string.
If it is too long to -fit in a single line, you can put a backslash (``\``) at the end of line -as the line continuation marker, so the value of the option will be -the string after ``=`` in current line combined with the string in the next -line:: +The value of a configuration option is a string. If the string is too long to +fit on a single line, you can put a backslash (``\``) at the end of the line +and the backslash will act as a line continuation marker. In such a case, the +value of the option will be the string after ``=`` in the current line, +combined with the string in the next line. Here is an example:: [global] foo = long long ago\ long ago -In the example above, the value of "``foo``" would be "``long long ago long ago``". +In this example, the value of the "``foo``" option is "``long long ago long +ago``". -Normally, the option value ends with a new line, or a comment, like +An option value typically ends with either a newline or a comment. For +example: .. code-block:: ini @@ -309,100 +308,108 @@ Normally, the option value ends with a new line, or a comment, like obscure_one = difficult to explain # I will try harder in next release simpler_one = nothing to explain -In the example above, the value of "``obscure one``" would be "``difficult to explain``"; -and the value of "``simpler one`` would be "``nothing to explain``". +In this example, the value of the "``obscure one``" option is "``difficult to +explain``" and the value of the "``simpler one`` options is "``nothing to +explain``". -If an option value contains spaces, and we want to make it explicit, we -could quote the value using single or double quotes, like +When an option value contains spaces, it can be enclosed within single quotes +or double quotes in order to make its scope clear and in order to make sure +that the first space in the value is not interpreted as the end of the value. +For example: .. 
code-block:: ini [global] line = "to be, or not to be" -Certain characters are not allowed to be present in the option values directly. -They are ``=``, ``#``, ``;`` and ``[``. If we have to, we need to escape them, -like +In option values, there are four characters that are treated as escape +characters: ``=``, ``#``, ``;`` and ``[``. They are permitted to occur in an +option value only if they are immediately preceded by the backslash character +(``\``). For example: .. code-block:: ini [global] secret = "i love \# and \[" -Every configuration option is typed with one of the types below: +Each configuration option falls under one of the following types: .. describe:: int - 64-bit signed integer, Some SI prefixes are supported, like "K", "M", "G", - "T", "P", "E", meaning, respectively, 10\ :sup:`3`, 10\ :sup:`6`, - 10\ :sup:`9`, etc. And "B" is the only supported unit. So, "1K", "1M", "128B" and "-1" are all valid - option values. Some times, a negative value implies "unlimited" when it comes to - an option for threshold or limit. + 64-bit signed integer. Some SI suffixes are supported, such as "K", "M", + "G", "T", "P", and "E" (meaning, respectively, 10\ :sup:`3`, 10\ :sup:`6`, + 10\ :sup:`9`, etc.). "B" is the only supported unit string. Thus "1K", "1M", + "128B" and "-1" are all valid option values. When a negative value is + assigned to a threshold option, this can indicate that the option is + "unlimited" -- that is, that there is no threshold or limit in effect. :example: ``42``, ``-1`` .. describe:: uint - It is almost identical to ``integer``. But a negative value will be rejected. + This differs from ``integer`` only in that negative values are not + permitted. :example: ``256``, ``0`` .. describe:: str - Free style strings encoded in UTF-8, but some characters are not allowed. Please - reference the above notes for the details. + A string encoded in UTF-8. Certain characters are not permitted. Reference + the above notes for the details. 
:example: ``"hello world"``, ``"i love \#"``, ``yet-another-name`` .. describe:: boolean - one of the two values ``true`` or ``false``. But an integer is also accepted, - where "0" implies ``false``, and any non-zero values imply ``true``. + Typically either of the two values ``true`` or ``false``. However, any + integer is permitted: "0" implies ``false``, and any non-zero value implies + ``true``. :example: ``true``, ``false``, ``1``, ``0`` .. describe:: addr - a single address optionally prefixed with ``v1``, ``v2`` or ``any`` for the messenger - protocol. If the prefix is not specified, ``v2`` protocol is used. Please see - :ref:`address_formats` for more details. + A single address, optionally prefixed with ``v1``, ``v2`` or ``any`` for the + messenger protocol. If no prefix is specified, the ``v2`` protocol is used. + For more details, see :ref:`address_formats`. :example: ``v1:1.2.3.4:567``, ``v2:1.2.3.4:567``, ``1.2.3.4:567``, ``2409:8a1e:8fb6:aa20:1260:4bff:fe92:18f5::567``, ``[::1]:6789`` .. describe:: addrvec - a set of addresses separated by ",". The addresses can be optionally quoted with ``[`` and ``]``. + A set of addresses separated by ",". The addresses can be optionally quoted + with ``[`` and ``]``. :example: ``[v1:1.2.3.4:567,v2:1.2.3.4:568]``, ``v1:1.2.3.4:567,v1:1.2.3.14:567`` ``[2409:8a1e:8fb6:aa20:1260:4bff:fe92:18f5::567], [2409:8a1e:8fb6:aa20:1260:4bff:fe92:18f5::568]`` .. describe:: uuid - the string format of a uuid defined by `RFC4122 `_. - And some variants are also supported, for more details, see - `Boost document `_. + The string format of a uuid defined by `RFC4122 + `_. Certain variants are also + supported: for more details, see `Boost document + `_. :example: ``f81d4fae-7dec-11d0-a765-00a0c91e6bf6`` .. describe:: size - denotes a 64-bit unsigned integer. Both SI prefixes and IEC prefixes are - supported. And "B" is the only supported unit. A negative value will be - rejected. + 64-bit unsigned integer. 
Both SI prefixes and IEC prefixes are supported. + "B" is the only supported unit string. Negative values are not permitted. :example: ``1Ki``, ``1K``, ``1KiB`` and ``1B``. .. describe:: secs - denotes a duration of time. By default the unit is second if not specified. - Following units of time are supported: + Denotes a duration of time. The default unit of time is the second. + The following units of time are supported: - * second: "s", "sec", "second", "seconds" - * minute: "m", "min", "minute", "minutes" - * hour: "hs", "hr", "hour", "hours" - * day: "d", "day", "days" - * week: "w", "wk", "week", "weeks" - * month: "mo", "month", "months" - * year: "y", "yr", "year", "years" + * second: ``s``, ``sec``, ``second``, ``seconds`` + * minute: ``m``, ``min``, ``minute``, ``minutes`` + * hour: ``hs``, ``hr``, ``hour``, ``hours`` + * day: ``d``, ``day``, ``days`` + * week: ``w``, ``wk``, ``week``, ``weeks`` + * month: ``mo``, ``month``, ``months`` + * year: ``y``, ``yr``, ``year``, ``years`` :example: ``1 m``, ``1m`` and ``1 week`` @@ -411,112 +418,103 @@ Every configuration option is typed with one of the types below: Monitor configuration database ============================== -The monitor cluster manages a database of configuration options that -can be consumed by the entire cluster, enabling streamlined central -configuration management for the entire system. The vast majority of -configuration options can and should be stored here for ease of -administration and transparency. +The monitor cluster manages a database of configuration options that can be +consumed by the entire cluster. This allows for streamlined central +configuration management of the entire system. For ease of administration and +transparency, the vast majority of configuration options can and should be +stored in this database. 
-A handful of settings may still need to be stored in local -configuration files because they affect the ability to connect to the -monitors, authenticate, and fetch configuration information. In most -cases this is limited to the ``mon_host`` option, although this can -also be avoided through the use of DNS SRV records. +Some settings might need to be stored in local configuration files because they +affect the ability of the process to connect to the monitors, to authenticate, +and to fetch configuration information. In most cases this applies only to the +``mon_host`` option. This issue can be avoided by using :ref:`DNS SRV +records`. Sections and masks ------------------ -Configuration options stored by the monitor can live in a global -section, daemon type section, or specific daemon section, just like -options in a configuration file can. +Configuration options stored by the monitor can be stored in a global section, +in a daemon-type section, or in a specific daemon section. In this, they are +no different from the options in a configuration file. -In addition, options may also have a *mask* associated with them to -further restrict which daemons or clients the option applies to. -Masks take two forms: +In addition, options may have a *mask* associated with them to further restrict +which daemons or clients the option applies to. Masks take two forms: -#. ``type:location`` where *type* is a CRUSH property like `rack` or - `host`, and *location* is a value for that property. For example, +#. ``type:location`` where ``type`` is a CRUSH property like ``rack`` or + ``host``, and ``location`` is a value for that property. For example, ``host:foo`` would limit the option only to daemons or clients running on a particular host. -#. ``class:device-class`` where *device-class* is the name of a CRUSH - device class (e.g., ``hdd`` or ``ssd``). For example, +#. 
``class:device-class`` where ``device-class`` is the name of a CRUSH + device class (for example, ``hdd`` or ``ssd``). For example, ``class:ssd`` would limit the option only to OSDs backed by SSDs. - (This mask has no effect for non-OSD daemons or clients.) - -When setting a configuration option, the `who` may be a section name, -a mask, or a combination of both separated by a slash (``/``) -character. For example, ``osd/rack:foo`` would mean all OSD daemons -in the ``foo`` rack. + (This mask has no effect on non-OSD daemons or clients.) -When viewing configuration options, the section name and mask are -generally separated out into separate fields or columns to ease readability. +In commands that specify a configuration option, the argument of the option (in +the following examples, this is the "who" string) may be a section name, a +mask, or a combination of both separated by a slash character (``/``). For +example, ``osd/rack:foo`` would refer to all OSD daemons in the ``foo`` rack. +When configuration options are shown, the section name and mask are presented +in separate fields or columns to make them more readable. Commands -------- The following CLI commands are used to configure the cluster: -* ``ceph config dump`` will dump the entire monitors' configuration +* ``ceph config dump`` dumps the entire monitor configuration database for the cluster. -* ``ceph config get `` will dump configuration options stored in - the monitors' configuration database for a specific daemon or client - (e.g., ``mds.a``). +* ``ceph config get `` dumps the configuration options stored in + the monitor configuration database for a specific daemon or client + (for example, ``mds.a``). -* ``ceph config get