]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/mon_clock_skew_check.py
891e6ec484ede7c5f7c2b4480a6be2678e4e32a4
[ceph.git] / ceph / qa / tasks / mon_clock_skew_check.py
1 """
2 Handle clock skews in monitors.
3 """
4 import logging
5 import contextlib
6 import ceph_manager
7 import time
8 import gevent
9 from StringIO import StringIO
10 from teuthology import misc as teuthology
11
12 log = logging.getLogger(__name__)
13
14 class ClockSkewCheck:
15 """
16 Periodically check if there are any clock skews among the monitors in the
17 quorum. By default, assume no skews are supposed to exist; that can be
18 changed using the 'expect-skew' option. If 'fail-on-skew' is set to false,
19 then we will always succeed and only report skews if any are found.
20
21 This class does not spawn a thread. It assumes that, if that is indeed
22 wanted, it should be done by a third party (for instance, the task using
23 this class). We intend it as such in order to reuse this class if need be.
24
25 This task accepts the following options:
26
27 interval amount of seconds to wait in-between checks. (default: 30.0)
28 max-skew maximum skew, in seconds, that is considered tolerable before
29 issuing a warning. (default: 0.05)
30 expect-skew 'true' or 'false', to indicate whether to expect a skew during
31 the run or not. If 'true', the test will fail if no skew is
32 found, and succeed if a skew is indeed found; if 'false', it's
33 the other way around. (default: false)
34 never-fail Don't fail the run if a skew is detected and we weren't
35 expecting it, or if no skew is detected and we were expecting
36 it. (default: False)
37
38 at-least-once Runs at least once, even if we are told to stop.
39 (default: True)
40 at-least-once-timeout If we were told to stop but we are attempting to
41 run at least once, timeout after this many seconds.
42 (default: 600)
43
44 Example:
45 Expect a skew higher than 0.05 seconds, but only report it without
46 failing the teuthology run.
47
48 - mon_clock_skew_check:
49 interval: 30
50 max-skew: 0.05
51 expect_skew: true
52 never-fail: true
53 """
54
55 def __init__(self, ctx, manager, config, logger):
56 self.ctx = ctx
57 self.manager = manager
58
59 self.stopping = False
60 self.logger = logger
61 self.config = config
62
63 if self.config is None:
64 self.config = dict()
65
66 self.check_interval = float(self.config.get('interval', 30.0))
67
68 first_mon = teuthology.get_first_mon(ctx, config)
69 remote = ctx.cluster.only(first_mon).remotes.keys()[0]
70 proc = remote.run(
71 args=[
72 'sudo',
73 'ceph-mon',
74 '-i', first_mon[4:],
75 '--show-config-value', 'mon_clock_drift_allowed'
76 ], stdout=StringIO(), wait=True
77 )
78 self.max_skew = self.config.get('max-skew', float(proc.stdout.getvalue()))
79
80 self.expect_skew = self.config.get('expect-skew', False)
81 self.never_fail = self.config.get('never-fail', False)
82 self.at_least_once = self.config.get('at-least-once', True)
83 self.at_least_once_timeout = self.config.get('at-least-once-timeout', 600.0)
84
85 def info(self, x):
86 """
87 locally define logger for info messages
88 """
89 self.logger.info(x)
90
91 def warn(self, x):
92 """
93 locally define logger for warnings
94 """
95 self.logger.warn(x)
96
97 def debug(self, x):
98 """
99 locally define logger for debug messages
100 """
101 self.logger.info(x)
102 self.logger.debug(x)
103
104 def finish(self):
105 """
106 Break out of the do_check loop.
107 """
108 self.stopping = True
109
110 def sleep_interval(self):
111 """
112 If a sleep interval is set, sleep for that amount of time.
113 """
114 if self.check_interval > 0.0:
115 self.debug('sleeping for {s} seconds'.format(
116 s=self.check_interval))
117 time.sleep(self.check_interval)
118
119 def print_skews(self, skews):
120 """
121 Display skew values.
122 """
123 total = len(skews)
124 if total > 0:
125 self.info('---------- found {n} skews ----------'.format(n=total))
126 for mon_id, values in skews.iteritems():
127 self.info('mon.{id}: {v}'.format(id=mon_id, v=values))
128 self.info('-------------------------------------')
129 else:
130 self.info('---------- no skews were found ----------')
131
132 def do_check(self):
133 """
134 Clock skew checker. Loops until finish() is called.
135 """
136 self.info('start checking for clock skews')
137 skews = dict()
138 ran_once = False
139
140 started_on = None
141
142 while not self.stopping or (self.at_least_once and not ran_once):
143
144 if self.at_least_once and not ran_once and self.stopping:
145 if started_on is None:
146 self.info('kicking-off timeout (if any)')
147 started_on = time.time()
148 elif self.at_least_once_timeout > 0.0:
149 assert time.time() - started_on < self.at_least_once_timeout, \
150 'failed to obtain a timecheck before timeout expired'
151
152 quorum_size = len(teuthology.get_mon_names(self.ctx))
153 self.manager.wait_for_mon_quorum_size(quorum_size)
154
155 health = self.manager.get_mon_health(True)
156 timechecks = health['timechecks']
157
158 clean_check = False
159
160 if timechecks['round_status'] == 'finished':
161 assert (timechecks['round'] % 2) == 0, \
162 'timecheck marked as finished but round ' \
163 'disagrees (r {r})'.format(
164 r=timechecks['round'])
165 clean_check = True
166 else:
167 assert timechecks['round_status'] == 'on-going', \
168 'timecheck status expected \'on-going\' ' \
169 'but found \'{s}\' instead'.format(
170 s=timechecks['round_status'])
171 if 'mons' in timechecks.keys() and len(timechecks['mons']) > 1:
172 self.info('round still on-going, but there are available reports')
173 else:
174 self.info('no timechecks available just yet')
175 self.sleep_interval()
176 continue
177
178 assert len(timechecks['mons']) > 1, \
179 'there are not enough reported timechecks; ' \
180 'expected > 1 found {n}'.format(n=len(timechecks['mons']))
181
182 for check in timechecks['mons']:
183 mon_skew = float(check['skew'])
184 mon_health = check['health']
185 mon_id = check['name']
186 if abs(mon_skew) > self.max_skew:
187 assert mon_health == 'HEALTH_WARN', \
188 'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format(
189 id=mon_id,health=mon_health,s=abs(mon_skew),ms=self.max_skew)
190
191 log_str = 'mon.{id} with skew {s} > max {ms}'.format(
192 id=mon_id,s=abs(mon_skew),ms=self.max_skew)
193
194 """ add to skew list """
195 details = check['details']
196 skews[mon_id] = {'skew': mon_skew, 'details': details}
197
198 if self.expect_skew:
199 self.info('expected skew: {str}'.format(str=log_str))
200 else:
201 self.warn('unexpected skew: {str}'.format(str=log_str))
202
203 if clean_check or (self.expect_skew and len(skews) > 0):
204 ran_once = True
205 self.print_skews(skews)
206 self.sleep_interval()
207
208 total = len(skews)
209 self.print_skews(skews)
210
211 error_str = ''
212 found_error = False
213
214 if self.expect_skew:
215 if total == 0:
216 error_str = 'We were expecting a skew, but none was found!'
217 found_error = True
218 else:
219 if total > 0:
220 error_str = 'We were not expecting a skew, but we did find it!'
221 found_error = True
222
223 if found_error:
224 self.info(error_str)
225 if not self.never_fail:
226 assert False, error_str
227
228 @contextlib.contextmanager
229 def task(ctx, config):
230 """
231 Use clas ClockSkewCheck to check for clock skews on the monitors.
232 This task will spawn a thread running ClockSkewCheck's do_check().
233
234 All the configuration will be directly handled by ClockSkewCheck,
235 so please refer to the class documentation for further information.
236 """
237 if config is None:
238 config = {}
239 assert isinstance(config, dict), \
240 'mon_clock_skew_check task only accepts a dict for configuration'
241 log.info('Beginning mon_clock_skew_check...')
242 first_mon = teuthology.get_first_mon(ctx, config)
243 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
244 manager = ceph_manager.CephManager(
245 mon,
246 ctx=ctx,
247 logger=log.getChild('ceph_manager'),
248 )
249
250 skew_check = ClockSkewCheck(ctx,
251 manager, config,
252 logger=log.getChild('mon_clock_skew_check'))
253 skew_check_thread = gevent.spawn(skew_check.do_check)
254 try:
255 yield
256 finally:
257 log.info('joining mon_clock_skew_check')
258 skew_check.finish()
259 skew_check_thread.get()
260
261