]> git.proxmox.com Git - ceph.git/blobdiff - ceph/qa/tasks/cephfs/test_mds_metrics.py
import ceph quincy 17.2.1
[ceph.git] / ceph / qa / tasks / cephfs / test_mds_metrics.py
index be680bb8600d580e78aa8f4456b18dd059b64ac6..727b80c6a91bf63e2c6df8a3ad980175b71c0616 100644 (file)
@@ -5,7 +5,7 @@ import random
 import logging
 import errno
 
-from teuthology.contextutil import safe_while
+from teuthology.contextutil import safe_while, MaxWhileTries
 from teuthology.exceptions import CommandFailedError
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
 
@@ -100,6 +100,24 @@ class TestMDSMetrics(CephFSTestCase):
                     break
         return done, metrics
 
+    def _setup_fs(self, fs_name):
+        fs_a = self.mds_cluster.newfs(name=fs_name)
+        
+        self.mds_cluster.mds_restart()
+
+        # Wait for filesystem to go healthy
+        fs_a.wait_for_daemons()
+        
+        # Reconfigure client auth caps
+        for mount in self.mounts:
+            self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+                'auth', 'caps', f"client.{mount.client_id}",
+                'mds', 'allow',
+                'mon', 'allow r',
+                'osd', f'allow rw pool={fs_a.get_data_pool_name()}')
+
+        return fs_a
+
     # basic check to verify if we get back metrics from each active mds rank
 
     def test_metrics_from_rank(self):
@@ -394,3 +412,102 @@ class TestMDSMetrics(CephFSTestCase):
                 raise
         else:
             raise RuntimeError("expected the 'fs perf stat' command to fail for invalid client_ip")
+
    def test_perf_stats_stale_metrics(self):
        """
        That `ceph fs perf stats` doesn't output stale metrics after the rank0 MDS failover
        """
        # validate: metrics must be reported for all mounted clients before
        # the failover, and we keep a pre-failover snapshot to compare against
        valid, metrics = self._get_metrics(self.verify_mds_metrics(
            active_mds_count=1, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
        log.debug(f'metrics={metrics}')
        self.assertTrue(valid)

        #mount_a and mount_b are the clients mounted for TestMDSMetrics. So get their
        #entries from the global_metrics.
        client_a_name = f'client.{self.mount_a.get_global_id()}'
        client_b_name = f'client.{self.mount_b.get_global_id()}'

        global_metrics = metrics['global_metrics']
        client_a_metrics = global_metrics[client_a_name]
        client_b_metrics = global_metrics[client_b_name]

        #fail rank0 mds
        self.fs.rank_fail(rank=0)

        # Wait for 10 seconds for the failover to complete and
        # the mgr to get initial metrics from the new rank0 mds.
        time.sleep(10)

        fscid = self.fs.id

        # spread directory per rank
        self._spread_directory_on_all_ranks(fscid)

        # spread some I/O
        self._do_spread_io_all_clients(fscid)

        # wait a bit for mgr to get updated metrics
        time.sleep(5)

        # validate: after the failover plus fresh I/O, the reported metrics
        # must differ from the pre-failover snapshot; identical values would
        # mean the mgr is still serving stale data from the old rank0
        try:
            valid, metrics_new = self._get_metrics(self.verify_mds_metrics(
                active_mds_count=1, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
            log.debug(f'metrics={metrics_new}')
            self.assertTrue(valid)

            global_metrics = metrics_new['global_metrics']
            client_a_metrics_new = global_metrics[client_a_name]
            client_b_metrics_new = global_metrics[client_b_name]

            #the metrics should be different for the test to succeed.
            self.assertNotEqual(client_a_metrics, client_a_metrics_new)
            self.assertNotEqual(client_b_metrics, client_b_metrics_new)
        except MaxWhileTries:
            # _get_metrics exhausted its retries without seeing valid metrics
            raise RuntimeError("Failed to fetch `ceph fs perf stats` metrics")
        finally:
            # cleanup test directories
            self._cleanup_test_dirs()
+
+    def test_client_metrics_and_metadata(self):
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
+            "enable_multiple", "true",
+            "--yes-i-really-mean-it")
+            
+        #creating filesystem
+        fs_a = self._setup_fs(fs_name = "fs1")
+
+        # Mount a client on fs_a
+        self.mount_a.mount_wait(cephfs_name=fs_a.name)
+        self.mount_a.write_n_mb("pad.bin", 1)
+        self.mount_a.write_n_mb("test.bin", 2)
+        self.mount_a.path_to_ino("test.bin")
+        self.mount_a.create_files()
+
+        #creating another filesystem
+        fs_b = self._setup_fs(fs_name = "fs2")
+
+        # Mount a client on fs_b
+        self.mount_b.mount_wait(cephfs_name=fs_b.name)
+        self.mount_b.write_n_mb("test.bin", 1)
+        self.mount_b.path_to_ino("test.bin")
+        self.mount_b.create_files()
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30)
+        log.debug(f"metrics={metrics}")
+        self.assertTrue(valid)
+        
+        client_metadata = metrics['client_metadata']
+
+        for i in client_metadata:
+            if not (client_metadata[i]['hostname']):
+                raise RuntimeError("hostname not found!")
+            if not (client_metadata[i]['valid_metrics']):
+                raise RuntimeError("valid_metrics not found!")
+