
import logging
import json

from teuthology.task import Task
from teuthology import misc

from tasks import ceph_manager

log = logging.getLogger(__name__)


class CheckCounter(Task):
    """
    Use this task to validate that some daemon perf counters were
    incremented by the nested tasks.

    Config:
    'cluster_name': optional, specify which cluster
    'counters': dictionary of daemon type to list of performance counters.
        Each entry is either a plain counter name (which must reach at
        least 1) or a dict with a 'name' and an optional 'min' or
        'expected_val'.
    'dry_run': just log the value of the counters, don't fail if they
        aren't nonzero.

    Success condition is that for all of the named counters, at least
    one of the daemons of that type has the counter nonzero (or has
    reached its configured 'min' / 'expected_val').

    Example to check cephfs dirfrag splits are happening:
    - install:
    - ceph:
    - ceph-fuse:
    - check-counter:
        counters:
            mds:
                - "mds.dir_split"
                -
                  name: "mds.dir_update"
                  min: 3
    - workunit: ...
    """
    @property
    def admin_remote(self):
        first_mon = misc.get_first_mon(self.ctx, None)
        (result,) = self.ctx.cluster.only(first_mon).remotes.keys()
        return result

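    # The real validation happens in end(), after the nested tasks have run;
    # start() only marks the beginning of the task in the log.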
    def start(self):
        log.info("START")

    def end(self):
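        # Fold any 'check-counter' overrides from the job yaml into this
        # task's config before reading it.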
        overrides = self.ctx.config.get('overrides', {})
        misc.deep_merge(self.config, overrides.get('check-counter', {}))

        cluster_name = self.config.get('cluster_name', None)
        dry_run = self.config.get('dry_run', False)
        targets = self.config.get('counters', {})

        if cluster_name is None:
            cluster_name = next(iter(self.ctx.managers.keys()))

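        # Ask the monitors which mgr is active: standby mgrs don't service
        # requests, so their counters are skipped in the loop below.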
        mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=self.ctx, logger=log.getChild('ceph_manager'))
        active_mgr = json.loads(mon_manager.raw_cluster_cmd("mgr", "dump", "--format=json-pretty"))["active_name"]

        for daemon_type, counters in targets.items():
            # List of 'a', 'b', 'c'...
            daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type))
            daemons = dict([(daemon_id,
                             self.ctx.daemons.get_daemon(daemon_type, daemon_id))
                            for daemon_id in daemon_ids])

            expected = set()
            seen = set()

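            # Pull a perf dump from each running daemon of this type over
            # its admin socket.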
            for daemon_id, daemon in daemons.items():
                if not daemon.running():
                    log.info("Ignoring daemon {0}, it isn't running".format(daemon_id))
                    continue
                elif daemon_type == 'mgr' and daemon_id != active_mgr:
                    continue
                else:
                    log.debug("Getting stats from {0}".format(daemon_id))

                manager = self.ctx.managers[cluster_name]
                proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"])
                response_data = proc.stdout.getvalue().strip()
                if response_data:
                    perf_dump = json.loads(response_data)
                else:
                    log.warning("No admin socket response from {0}, skipping".format(daemon_id))
                    continue

                for counter in counters:
                    # Reset per counter so a 'min' or 'expected_val' from a
                    # previous entry doesn't leak into this one.
                    minval = ''
                    expected_val = ''
                    if isinstance(counter, dict):
                        name = counter['name']
                        if 'min' in counter:
                            minval = counter['min']
                        if 'expected_val' in counter:
                            expected_val = counter['expected_val']
                    else:
                        name = counter
                        minval = 1
                    expected.add(name)

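                    # Counter names are dotted paths into the perf dump JSON,
                    # e.g. "mds.dir_split" -> perf_dump["mds"]["dir_split"].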
                    val = perf_dump
                    for key in name.split('.'):
                        if key not in val:
                            log.warning(f"Counter '{name}' not found on daemon {daemon_type}.{daemon_id}")
                            val = None
                            break

                        val = val[key]

                    if val is not None:
                        log.info(f"Daemon {daemon_type}.{daemon_id} {name}={val}")
                        if isinstance(minval, int) and val >= minval:
                            seen.add(name)
                        elif isinstance(expected_val, int) and val == expected_val:
                            seen.add(name)

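            # Unless this is a dry run, every expected counter must have been
            # seen on at least one daemon of this type.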
            if not dry_run:
                unseen = set(expected) - set(seen)
                if unseen:
                    raise RuntimeError("The following counters failed to be set "
                                       "on {0} daemons: {1}".format(
                                           daemon_type, unseen
                                       ))

task = CheckCounter