"""
Thrash mds by simulating failures
"""
import logging
import contextlib
import ceph_manager
import itertools
import random
import signal
import time

from gevent import sleep
from gevent.greenlet import Greenlet
from gevent.event import Event
from teuthology import misc as teuthology

from tasks.cephfs.filesystem import MDSCluster, Filesystem

log = logging.getLogger(__name__)


class DaemonWatchdog(Greenlet):
    """
    DaemonWatchdog::

    Watch Ceph daemons for failures. If an extended failure is detected (i.e.
    not intentional), then the watchdog will unmount file systems and send
    SIGTERM to all daemons. The duration of an extended failure is configurable
    with watchdog_daemon_timeout.

    watchdog_daemon_timeout [default: 300]: number of seconds a daemon
      is allowed to be failed before the watchdog will bark.
    """
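
    # The watchdog reads its timeout from the same config dict the mds_thrash
    # task receives. A minimal, hypothetical teuthology fragment; of the keys
    # below, only watchdog_daemon_timeout is consumed by this class:
    #
    #   tasks:
    #   - ceph:
    #   - mds_thrash:
    #       watchdog_daemon_timeout: 600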

    def __init__(self, ctx, manager, config, thrashers):
        Greenlet.__init__(self)
        self.ctx = ctx
        self.config = config
        self.e = None
        self.logger = log.getChild('daemon_watchdog')
        self.manager = manager
        self.name = 'watchdog'
        self.stopping = Event()
        self.thrashers = thrashers

    def _run(self):
        try:
            self.watch()
        except Exception as e:
            # See _run exception comment for MDSThrasher
            self.e = e
            self.logger.exception("exception:")
            # allow successful completion so gevent doesn't see an exception...

    def log(self, x):
        """Write data to logger"""
        self.logger.info(x)

    def stop(self):
        self.stopping.set()

    def bark(self):
        self.log("BARK! unmounting mounts and killing all daemons")
        for mount in self.ctx.mounts.values():
            try:
                mount.umount_wait(force=True)
            except:
                self.logger.exception("ignoring exception:")
        daemons = []
        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished,
                              self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.manager.cluster)))
        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished,
                              self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.manager.cluster)))
        for daemon in daemons:
            try:
                daemon.signal(signal.SIGTERM)
            except:
                self.logger.exception("ignoring exception:")

    def watch(self):
        self.log("watchdog starting")
        daemon_timeout = int(self.config.get('watchdog_daemon_timeout', 300))
        daemon_failure_time = {}
        while not self.stopping.is_set():
            bark = False
            now = time.time()

            mons = self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.manager.cluster)
            mdss = self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.manager.cluster)
            clients = self.ctx.daemons.iter_daemons_of_role('client', cluster=self.manager.cluster)

            #for daemon in mons:
            #    self.log("mon daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished))
            #for daemon in mdss:
            #    self.log("mds daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished))

            daemon_failures = []
            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mons))
            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mdss))
            for daemon in daemon_failures:
                name = daemon.role + '.' + daemon.id_
                dt = daemon_failure_time.setdefault(name, (daemon, now))
                assert dt[0] is daemon
                delta = now - dt[1]
                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
                if delta > daemon_timeout:
                    bark = True

            # If a daemon is no longer failed, remove it from tracking:
            for name in list(daemon_failure_time.keys()):
                if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
                    self.log("daemon {name} has been restored".format(name=name))
                    del daemon_failure_time[name]

            for thrasher in self.thrashers:
                if thrasher.e is not None:
                    self.log("thrasher on fs.{name} failed".format(name=thrasher.fs.name))
                    bark = True

            if bark:
                self.bark()
                return

            sleep(5)

        self.log("watchdog finished")


class MDSThrasher(Greenlet):
    """
    MDSThrasher::

    The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).

    The config is optional. Many of the config parameters are a maximum value
    to use when selecting a random value from a range. To always use the maximum
    value, set no_random to true. The config is a dict containing some or all of:

    max_thrash: [default: 1] the maximum number of active MDSs per FS that will
      be thrashed at any given time.

    max_thrash_delay: [default: 30] maximum number of seconds to delay before
      thrashing again.

    max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
      the replay state before thrashing.

    max_revive_delay: [default: 10] maximum number of seconds to delay before
      bringing back a thrashed MDS.

    randomize: [default: true] enables randomization and uses the max/min values

    seed: [no default] seed the random number generator

    thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
      during replay. Value should be between 0.0 and 1.0.

    thrash_max_mds: [default: 0.05] likelihood that the max_mds of the mds
      cluster will be modified to a value [1, current) or (current, starting
      max_mds]. Value should be between 0.0 and 1.0.

    thrash_while_stopping: [default: false] thrash an MDS while there
      are MDS in up:stopping (because max_mds was changed and some
      MDS were deactivated).

    thrash_weights: allows specific MDSs to be thrashed more/less frequently.
      This option overrides anything specified by max_thrash. This option is a
      dict containing mds.x: weight pairs. For example, [mds.a: 0.7, mds.b:
      0.3, mds.c: 0.0]. Each weight is a value from 0.0 to 1.0. Any MDSs not
      specified will be automatically given a weight of 0.0 (not thrashed).
      For a given MDS, by default the thrasher delays for up to
      max_thrash_delay, thrashes, waits for the MDS to recover, and iterates.
      If a non-zero weight is specified for an MDS, for each iteration the
      thrasher chooses whether to thrash during that iteration based on a
      random value [0-1] not exceeding the weight of that MDS.

    Examples::

      The following example sets the likelihood that mds.a will be thrashed
      to 80%, mds.b to 20%, and other MDSs will not be thrashed. It also sets the
      likelihood that an MDS will be thrashed in replay to 40%.
      Thrash weights do not have to sum to 1.

      tasks:
      - ceph:
      - mds_thrash:
          thrash_weights:
            - mds.a: 0.8
            - mds.b: 0.2
          thrash_in_replay: 0.4
      - ceph-fuse:
      - workunit:
          clients:
            all: [suites/fsx.sh]

      The following example disables randomization, and uses the max delay values:

      tasks:
      - ceph:
      - mds_thrash:
          max_thrash_delay: 10
          max_revive_delay: 1
          max_replay_thrash_delay: 4
    """
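
    # A further hypothetical fragment (not from the docstring above) exercising
    # the max_mds knobs: with thrash_max_mds at 0.1, roughly one iteration in
    # ten changes the file system's max_mds, and thrash_while_stopping lets
    # thrashing continue while ranks are still in up:stopping:
    #
    #   tasks:
    #   - ceph:
    #   - mds_thrash:
    #       thrash_max_mds: 0.1
    #       thrash_while_stopping: true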

    def __init__(self, ctx, manager, config, fs, max_mds):
        Greenlet.__init__(self)

        self.config = config
        self.ctx = ctx
        self.e = None
        self.logger = log.getChild('fs.[{f}]'.format(f=fs.name))
        self.fs = fs
        self.manager = manager
        self.max_mds = max_mds
        self.name = 'thrasher.fs.[{f}]'.format(f=fs.name)
        self.stopping = Event()

        self.randomize = bool(self.config.get('randomize', True))
        self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.05))
        self.max_thrash = int(self.config.get('max_thrash', 1))
        self.max_thrash_delay = float(self.config.get('thrash_delay', 120.0))
        self.thrash_in_replay = float(self.config.get('thrash_in_replay', 0.0))
        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, \
            'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(v=self.thrash_in_replay)
        self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))
        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))

    def _run(self):
        try:
            self.do_thrash()
        except Exception as e:
            # Log exceptions here so we get the full backtrace (gevent loses them).
            # Also allow successful completion as gevent exception handling is a broken mess:
            #
            # 2017-02-03T14:34:01.259 CRITICAL:root:  File "gevent.libev.corecext.pyx", line 367, in gevent.libev.corecext.loop.handle_error (src/gevent/libev/gevent.corecext.c:5051)
            #   File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 558, in handle_error
            #     self.print_exception(context, type, value, tb)
            #   File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 605, in print_exception
            #     traceback.print_exception(type, value, tb, file=errstream)
            #   File "/usr/lib/python2.7/traceback.py", line 124, in print_exception
            #     _print(file, 'Traceback (most recent call last):')
            #   File "/usr/lib/python2.7/traceback.py", line 13, in _print
            #     file.write(str+terminator)
            # 2017-02-03T14:34:01.261 CRITICAL:root:IOError
            self.e = e
            self.logger.exception("exception:")
            # allow successful completion so gevent doesn't see an exception...

    def log(self, x):
        """Write data to logger assigned to this MDSThrasher"""
        self.logger.info(x)

    def stop(self):
        self.stopping.set()

    def kill_mds(self, mds):
        if self.config.get('powercycle'):
            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
                         remotes.iterkeys())
            self.log('kill_mds on mds.{m} doing powercycle of {s}'.
                     format(m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_off()
        else:
            self.ctx.daemons.get_daemon('mds', mds).stop()

    @staticmethod
    def _assert_ipmi(remote):
        assert remote.console.has_ipmi_credentials, (
            "powercycling requested but RemoteConsole is not "
            "initialized. Check ipmi config.")

    def revive_mds(self, mds):
        """
        Revive mds -- do an ipmi powercycle (if indicated by the config)
        and then restart.
        """
        args = []
        if self.config.get('powercycle'):
            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
                         remotes.iterkeys())
            self.log('revive_mds on mds.{m} doing powercycle of {s}'.
                     format(m=mds, s=remote.name))
            self._assert_ipmi(remote)
            remote.console.power_on()
            self.manager.make_admin_daemon_dir(self.ctx, remote)
        self.ctx.daemons.get_daemon('mds', mds).restart(*args)

    def wait_for_stable(self, rank=None, gid=None):
        self.log('waiting for mds cluster to stabilize...')
        for itercount in itertools.count():
            status = self.fs.status()
            max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
            ranks = list(status.get_ranks(self.fs.id))
            stopping = list(filter(lambda info: "up:stopping" == info['state'], ranks))
            actives = list(filter(lambda info: "up:active" == info['state'] and "laggy_since" not in info, ranks))

            if not bool(self.config.get('thrash_while_stopping', False)) and len(stopping) > 0:
                if itercount % 5 == 0:
                    self.log('cluster is considered unstable while MDS are in up:stopping (!thrash_while_stopping)')
            else:
                if rank is not None:
                    try:
                        info = status.get_rank(self.fs.id, rank)
                        if info['gid'] != gid and "up:active" == info['state']:
                            self.log('mds.{name} has gained rank={rank}, replacing gid={gid}'.format(name=info['name'], rank=rank, gid=gid))
                            return status
                    except:
                        pass # no rank present
                    if len(actives) >= max_mds:
                        # no replacement can occur!
                        self.log("cluster has %d actives (max_mds is %d), no MDS can replace rank %d" % (len(actives), max_mds, rank))
                        return status
                else:
                    if len(actives) == max_mds:
                        # no replacement can occur!
                        self.log('mds cluster has {count} alive and active, now stable!'.format(count=len(actives)))
                        return status
            if itercount > 300/2: # 5 minutes
                raise RuntimeError('timeout waiting for cluster to stabilize')
            elif itercount % 5 == 0:
                self.log('mds map: {status}'.format(status=status))
            else:
                self.log('no change')
            sleep(2)

    def do_thrash(self):
        """
        Perform the random thrashing action
        """

        self.log('starting mds_do_thrash for fs {fs}'.format(fs=self.fs.name))
        stats = {
            "max_mds": 0,
            "deactivate": 0,
            "kill": 0,
        }

        while not self.stopping.is_set():
            delay = self.max_thrash_delay
            if self.randomize:
                delay = random.randrange(0.0, self.max_thrash_delay)

            if delay > 0.0:
                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
                self.stopping.wait(delay)
                if self.stopping.is_set():
                    continue

            status = self.fs.status()

            if random.random() <= self.thrash_max_mds:
                max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
                options = range(1, max_mds) + range(max_mds + 1, self.max_mds + 1)
                if len(options) > 0:
                    sample = random.sample(options, 1)
                    new_max_mds = sample[0]
                    self.log('thrashing max_mds: %d -> %d' % (max_mds, new_max_mds))
                    self.fs.set_max_mds(new_max_mds)
                    stats['max_mds'] += 1
                    self.wait_for_stable()

            count = 0
            for info in status.get_ranks(self.fs.id):
                name = info['name']
                label = 'mds.' + name
                rank = info['rank']
                gid = info['gid']

                # if thrash_weights isn't specified and we've reached max_thrash,
                # we're done
                count = count + 1
                if 'thrash_weights' not in self.config and count > self.max_thrash:
                    break

                weight = 1.0
                if 'thrash_weights' in self.config:
                    weight = self.config['thrash_weights'].get(label, 0.0)
                skip = random.random()
                if weight <= skip:
                    self.log('skipping thrash iteration with skip ({skip}) >= weight ({weight})'.format(skip=skip, weight=weight))
                    continue

                self.log('kill {label} (rank={rank})'.format(label=label, rank=rank))
                self.kill_mds(name)
                stats['kill'] += 1

                # wait for mon to report killed mds as crashed
                last_laggy_since = None
                itercount = 0
                while True:
                    status = self.fs.status()
                    info = status.get_mds(name)
                    if not info:
                        break
                    if 'laggy_since' in info:
                        last_laggy_since = info['laggy_since']
                        break
                    if any([(f == name) for f in status.get_fsmap(self.fs.id)['mdsmap']['failed']]):
                        break
                    self.log(
                        'waiting till mds map indicates {label} is laggy/crashed, in failed state, or {label} is removed from mdsmap'.format(
                            label=label))
                    itercount = itercount + 1
                    if itercount > 10:
                        self.log('mds map: {status}'.format(status=status))
                    sleep(2)

                if last_laggy_since:
                    self.log(
                        '{label} reported laggy/crashed since: {since}'.format(label=label, since=last_laggy_since))
                else:
                    self.log('{label} down, removed from mdsmap'.format(label=label))

                # wait for a standby mds to takeover and become active
                status = self.wait_for_stable(rank, gid)

                # wait for a while before restarting old active to become new
                # standby
                delay = self.max_revive_delay
                if self.randomize:
                    delay = random.randrange(0.0, self.max_revive_delay)

                self.log('waiting for {delay} secs before reviving {label}'.format(
                    delay=delay, label=label))
                sleep(delay)

                self.log('reviving {label}'.format(label=label))
                self.revive_mds(name)

                for itercount in itertools.count():
                    if itercount > 300/2: # 5 minutes
                        raise RuntimeError('timeout waiting for MDS to revive')
                    status = self.fs.status()
                    info = status.get_mds(name)
                    if info and info['state'] in ('up:standby', 'up:standby-replay', 'up:active'):
                        self.log('{label} reported in {state} state'.format(label=label, state=info['state']))
                        break
                    self.log(
                        'waiting till mds map indicates {label} is in active, standby or standby-replay'.format(label=label))
                    sleep(2)

        for stat in stats:
            self.log("stat['{key}'] = {value}".format(key=stat, value=stats[stat]))

        # don't do replay thrashing right now
        # for info in status.get_replays(self.fs.id):
        #     # this might race with replay -> active transition...
        #     if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay:
        #         delay = self.max_replay_thrash_delay
        #         if self.randomize:
        #             delay = random.randrange(0.0, self.max_replay_thrash_delay)
        #         sleep(delay)
        #         self.log('kill replaying mds.{id}'.format(id=self.to_kill))
        #         self.kill_mds(self.to_kill)
        #
        #         delay = self.max_revive_delay
        #         if self.randomize:
        #             delay = random.randrange(0.0, self.max_revive_delay)
        #
        #         self.log('waiting for {delay} secs before reviving mds.{id}'.format(
        #             delay=delay, id=self.to_kill))
        #         sleep(delay)
        #
        #         self.log('revive mds.{id}'.format(id=self.to_kill))
        #         self.revive_mds(self.to_kill)


@contextlib.contextmanager
def task(ctx, config):
    """
    Stress test the mds by thrashing while another task/workunit
    is running.

    Please refer to the MDSThrasher class for further information on the
    available options.
    """
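    # A hypothetical minimal invocation; any MDSThrasher option documented
    # above can be nested under mds_thrash:
    #
    #   tasks:
    #   - ceph:
    #   - mds_thrash:
    #   - ceph-fuse:
    #   - workunit:
    #       clients:
    #         all: [suites/fsx.sh]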

    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mds_thrash task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 1, \
        'mds_thrash task requires at least 2 metadata servers'

    # choose random seed
    if 'seed' in config:
        seed = int(config['seed'])
    else:
        seed = int(time.time())
    log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
    random.seed(seed)
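
    # Pinning the seed makes a thrashing run reproducible; hypothetical fragment:
    #
    #   tasks:
    #   - mds_thrash:
    #       seed: 31337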

    (first,) = ctx.cluster.only('mds.{_id}'.format(_id=mdslist[0])).remotes.iterkeys()
    manager = ceph_manager.CephManager(
        first, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()
    log.info('Ready to start thrashing')

    thrashers = []

    watchdog = DaemonWatchdog(ctx, manager, config, thrashers)
    watchdog.start()

    manager.wait_for_clean()
    assert manager.is_clean()
    for fs in status.get_filesystems():
        thrasher = MDSThrasher(ctx, manager, config, Filesystem(ctx, fs['id']), fs['mdsmap']['max_mds'])
        thrasher.start()
        thrashers.append(thrasher)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining mds_thrashers')
        for thrasher in thrashers:
            thrasher.stop()
            if thrasher.e:
                raise RuntimeError('error during thrashing')
            thrasher.join()
        log.info('done joining')

        watchdog.stop()
        watchdog.join()