]>
git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/mon_clock_skew_check.py
891e6ec484ede7c5f7c2b4480a6be2678e4e32a4
2 Handle clock skews in monitors.
9 from StringIO
import StringIO
10 from teuthology
import misc
as teuthology
12 log
= logging
.getLogger(__name__
)
16 Periodically check if there are any clock skews among the monitors in the
17 quorum. By default, assume no skews are supposed to exist; that can be
18 changed using the 'expect-skew' option. If 'fail-on-skew' is set to false,
19 then we will always succeed and only report skews if any are found.
21 This class does not spawn a thread. It assumes that, if that is indeed
22 wanted, it should be done by a third party (for instance, the task using
23 this class). We intend it as such in order to reuse this class if need be.
25 This task accepts the following options:
27 interval amount of seconds to wait in-between checks. (default: 30.0)
28 max-skew maximum skew, in seconds, that is considered tolerable before
29 issuing a warning. (default: 0.05)
30 expect-skew 'true' or 'false', to indicate whether to expect a skew during
31 the run or not. If 'true', the test will fail if no skew is
32 found, and succeed if a skew is indeed found; if 'false', it's
33 the other way around. (default: false)
34 never-fail Don't fail the run if a skew is detected and we weren't
35 expecting it, or if no skew is detected and we were expecting
38 at-least-once Runs at least once, even if we are told to stop.
40 at-least-once-timeout If we were told to stop but we are attempting to
41 run at least once, timeout after this many seconds.
45 Expect a skew higher than 0.05 seconds, but only report it without
46 failing the teuthology run.
48 - mon_clock_skew_check:
55 def __init__(self
, ctx
, manager
, config
, logger
):
57 self
.manager
= manager
63 if self
.config
is None:
66 self
.check_interval
= float(self
.config
.get('interval', 30.0))
68 first_mon
= teuthology
.get_first_mon(ctx
, config
)
69 remote
= ctx
.cluster
.only(first_mon
).remotes
.keys()[0]
75 '--show-config-value', 'mon_clock_drift_allowed'
76 ], stdout
=StringIO(), wait
=True
78 self
.max_skew
= self
.config
.get('max-skew', float(proc
.stdout
.getvalue()))
80 self
.expect_skew
= self
.config
.get('expect-skew', False)
81 self
.never_fail
= self
.config
.get('never-fail', False)
82 self
.at_least_once
= self
.config
.get('at-least-once', True)
83 self
.at_least_once_timeout
= self
.config
.get('at-least-once-timeout', 600.0)
87 locally define logger for info messages
93 locally define logger for warnings
99 locally define logger for debug messages
106 Break out of the do_check loop.
110 def sleep_interval(self
):
112 If a sleep interval is set, sleep for that amount of time.
114 if self
.check_interval
> 0.0:
115 self
.debug('sleeping for {s} seconds'.format(
116 s
=self
.check_interval
))
117 time
.sleep(self
.check_interval
)
119 def print_skews(self
, skews
):
125 self
.info('---------- found {n} skews ----------'.format(n
=total
))
126 for mon_id
, values
in skews
.iteritems():
127 self
.info('mon.{id}: {v}'.format(id=mon_id
, v
=values
))
128 self
.info('-------------------------------------')
130 self
.info('---------- no skews were found ----------')
134 Clock skew checker. Loops until finish() is called.
136 self
.info('start checking for clock skews')
142 while not self
.stopping
or (self
.at_least_once
and not ran_once
):
144 if self
.at_least_once
and not ran_once
and self
.stopping
:
145 if started_on
is None:
146 self
.info('kicking-off timeout (if any)')
147 started_on
= time
.time()
148 elif self
.at_least_once_timeout
> 0.0:
149 assert time
.time() - started_on
< self
.at_least_once_timeout
, \
150 'failed to obtain a timecheck before timeout expired'
152 quorum_size
= len(teuthology
.get_mon_names(self
.ctx
))
153 self
.manager
.wait_for_mon_quorum_size(quorum_size
)
155 health
= self
.manager
.get_mon_health(True)
156 timechecks
= health
['timechecks']
160 if timechecks
['round_status'] == 'finished':
161 assert (timechecks
['round'] % 2) == 0, \
162 'timecheck marked as finished but round ' \
163 'disagrees (r {r})'.format(
164 r
=timechecks
['round'])
167 assert timechecks
['round_status'] == 'on-going', \
168 'timecheck status expected \'on-going\' ' \
169 'but found \'{s}\' instead'.format(
170 s
=timechecks
['round_status'])
171 if 'mons' in timechecks
.keys() and len(timechecks
['mons']) > 1:
172 self
.info('round still on-going, but there are available reports')
174 self
.info('no timechecks available just yet')
175 self
.sleep_interval()
178 assert len(timechecks
['mons']) > 1, \
179 'there are not enough reported timechecks; ' \
180 'expected > 1 found {n}'.format(n
=len(timechecks
['mons']))
182 for check
in timechecks
['mons']:
183 mon_skew
= float(check
['skew'])
184 mon_health
= check
['health']
185 mon_id
= check
['name']
186 if abs(mon_skew
) > self
.max_skew
:
187 assert mon_health
== 'HEALTH_WARN', \
188 'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format(
189 id=mon_id
,health
=mon_health
,s
=abs(mon_skew
),ms
=self
.max_skew
)
191 log_str
= 'mon.{id} with skew {s} > max {ms}'.format(
192 id=mon_id
,s
=abs(mon_skew
),ms
=self
.max_skew
)
194 """ add to skew list """
195 details
= check
['details']
196 skews
[mon_id
] = {'skew': mon_skew
, 'details': details
}
199 self
.info('expected skew: {str}'.format(str=log_str
))
201 self
.warn('unexpected skew: {str}'.format(str=log_str
))
203 if clean_check
or (self
.expect_skew
and len(skews
) > 0):
205 self
.print_skews(skews
)
206 self
.sleep_interval()
209 self
.print_skews(skews
)
216 error_str
= 'We were expecting a skew, but none was found!'
220 error_str
= 'We were not expecting a skew, but we did find it!'
225 if not self
.never_fail
:
226 assert False, error_str
228 @contextlib.contextmanager
229 def task(ctx
, config
):
231 Use clas ClockSkewCheck to check for clock skews on the monitors.
232 This task will spawn a thread running ClockSkewCheck's do_check().
234 All the configuration will be directly handled by ClockSkewCheck,
235 so please refer to the class documentation for further information.
239 assert isinstance(config
, dict), \
240 'mon_clock_skew_check task only accepts a dict for configuration'
241 log
.info('Beginning mon_clock_skew_check...')
242 first_mon
= teuthology
.get_first_mon(ctx
, config
)
243 (mon
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
244 manager
= ceph_manager
.CephManager(
247 logger
=log
.getChild('ceph_manager'),
250 skew_check
= ClockSkewCheck(ctx
,
252 logger
=log
.getChild('mon_clock_skew_check'))
253 skew_check_thread
= gevent
.spawn(skew_check
.do_check
)
257 log
.info('joining mon_clock_skew_check')
259 skew_check_thread
.get()