]>
git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/mds_thrash.py
2 Thrash mds by simulating failures
12 from gevent
import sleep
13 from gevent
. greenlet
import Greenlet
14 from gevent
. event
import Event
15 from teuthology
import misc
as teuthology
17 from tasks
. cephfs
. filesystem
import MDSCluster
, Filesystem
19 log
= logging
. getLogger ( __name__
)
21 class DaemonWatchdog ( Greenlet
):
25 Watch Ceph daemons for failures. If an extended failure is detected (i.e.
26 not intentional), then the watchdog will unmount file systems and send
27 SIGTERM to all daemons. The duration of an extended failure is configurable
28 with watchdog_daemon_timeout.
30 watchdog_daemon_timeout [default: 300]: number of seconds a daemon
31 is allowed to be failed before the watchdog will bark.
34 def __init__ ( self
, ctx
, manager
, config
, thrashers
):
35 Greenlet
.__ init
__ ( self
)
39 self
. logger
= log
. getChild ( 'daemon_watchdog' )
40 self
. manager
= manager
41 self
. name
= 'watchdog'
42 self
. stopping
= Event ()
43 self
. thrashers
= thrashers
48 except Exception as e
:
49 # See _run exception comment for MDSThrasher
51 self
. logger
. exception ( "exception:" )
52 # allow successful completion so gevent doesn't see an exception...
55 """Write data to logger"""
62 self
. log ( "BARK! unmounting mounts and killing all daemons" )
63 for mount
in self
. ctx
. mounts
. values ():
65 mount
. umount_wait ( force
= True )
67 self
. logger
. exception ( "ignoring exception:" )
69 daemons
. extend ( filter ( lambda daemon
: daemon
. running () and not daemon
. proc
. finished
, self
. ctx
. daemons
. iter_daemons_of_role ( 'mds' , cluster
= self
. manager
. cluster
)))
70 daemons
. extend ( filter ( lambda daemon
: daemon
. running () and not daemon
. proc
. finished
, self
. ctx
. daemons
. iter_daemons_of_role ( 'mon' , cluster
= self
. manager
. cluster
)))
71 for daemon
in daemons
:
73 daemon
. signal ( signal
. SIGTERM
)
75 self
. logger
. exception ( "ignoring exception:" )
78 self
. log ( "watchdog starting" )
79 daemon_timeout
= int ( self
. config
. get ( 'watchdog_daemon_timeout' , 300 ))
80 daemon_failure_time
= {}
81 while not self
. stopping
. is_set ():
85 mons
= self
. ctx
. daemons
. iter_daemons_of_role ( 'mon' , cluster
= self
. manager
. cluster
)
86 mdss
= self
. ctx
. daemons
. iter_daemons_of_role ( 'mds' , cluster
= self
. manager
. cluster
)
87 clients
= self
. ctx
. daemons
. iter_daemons_of_role ( 'client' , cluster
= self
. manager
. cluster
)
90 # self.log("mon daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished))
92 # self.log("mds daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished))
95 daemon_failures
. extend ( filter ( lambda daemon
: daemon
. running () and daemon
. proc
. finished
, mons
))
96 daemon_failures
. extend ( filter ( lambda daemon
: daemon
. running () and daemon
. proc
. finished
, mdss
))
97 for daemon
in daemon_failures
:
98 name
= daemon
. role
+ '.' + daemon
. id_
99 dt
= daemon_failure_time
. setdefault ( name
, ( daemon
, now
))
100 assert dt
[ 0 ] is daemon
102 self
. log ( "daemon {name} is failed for ~{t:.0f}s" . format ( name
= name
, t
= delta
))
103 if delta
> daemon_timeout
:
106 # If a daemon is no longer failed, remove it from tracking:
107 for name
in daemon_failure_time
. keys ():
108 if name
not in [ d
. role
+ '.' + d
. id_
for d
in daemon_failures
]:
109 self
. log ( "daemon {name} has been restored" . format ( name
= name
))
110 del daemon_failure_time
[ name
]
112 for thrasher
in self
. thrashers
:
113 if thrasher
. e
is not None :
114 self
. log ( "thrasher on fs. {name} failed" . format ( name
= thrasher
. fs
. name
))
123 self
. log ( "watchdog finished" )
125 class MDSThrasher ( Greenlet
):
129 The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).
131 The config is optional. Many of the config parameters are a a maximum value
132 to use when selecting a random value from a range. To always use the maximum
133 value, set no_random to true. The config is a dict containing some or all of:
135 max_thrash: [default: 1] the maximum number of active MDSs per FS that will be thrashed at
138 max_thrash_delay: [default: 30] maximum number of seconds to delay before
141 max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
142 the replay state before thrashing.
144 max_revive_delay: [default: 10] maximum number of seconds to delay before
145 bringing back a thrashed MDS.
147 randomize: [default: true] enables randomization and use the max/min values
149 seed: [no default] seed the random number generator
151 thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
152 during replay. Value should be between 0.0 and 1.0.
154 thrash_max_mds: [default: 0.0] likelihood that the max_mds of the mds
155 cluster will be modified to a value [1, current) or (current, starting
156 max_mds]. When reduced, randomly selected MDSs other than rank 0 will be
157 deactivated to reach the new max_mds. Value should be between 0.0 and 1.0.
159 thrash_while_stopping: [default: false] thrash an MDS while there
160 are MDS in up:stopping (because max_mds was changed and some
161 MDS were deactivated).
163 thrash_weights: allows specific MDSs to be thrashed more/less frequently.
164 This option overrides anything specified by max_thrash. This option is a
165 dict containing mds.x: weight pairs. For example, [mds.a: 0.7, mds.b:
166 0.3, mds.c: 0.0]. Each weight is a value from 0.0 to 1.0. Any MDSs not
167 specified will be automatically given a weight of 0.0 (not thrashed).
168 For a given MDS, by default the trasher delays for up to
169 max_thrash_delay, trashes, waits for the MDS to recover, and iterates.
170 If a non-zero weight is specified for an MDS, for each iteration the
171 thrasher chooses whether to thrash during that iteration based on a
172 random value [0-1] not exceeding the weight of that MDS.
177 The following example sets the likelihood that mds.a will be thrashed
178 to 80%, mds.b to 20%, and other MDSs will not be thrashed. It also sets the
179 likelihood that an MDS will be thrashed in replay to 40%.
180 Thrash weights do not have to sum to 1.
188 thrash_in_replay: 0.4
194 The following example disables randomization, and uses the max delay values:
201 max_replay_thrash_delay: 4
205 def __init__ ( self
, ctx
, manager
, config
, fs
, max_mds
):
206 Greenlet
.__ init
__ ( self
)
211 self
. logger
= log
. getChild ( 'fs.[ {f} ]' . format ( f
= fs
. name
))
213 self
. manager
= manager
214 self
. max_mds
= max_mds
215 self
. name
= 'thrasher.fs.[ {f} ]' . format ( f
= fs
. name
)
216 self
. stopping
= Event ()
218 self
. randomize
= bool ( self
. config
. get ( 'randomize' , True ))
219 self
. thrash_max_mds
= float ( self
. config
. get ( 'thrash_max_mds' , 0.0 ))
220 self
. max_thrash
= int ( self
. config
. get ( 'max_thrash' , 1 ))
221 self
. max_thrash_delay
= float ( self
. config
. get ( 'thrash_delay' , 120.0 ))
222 self
. thrash_in_replay
= float ( self
. config
. get ( 'thrash_in_replay' , False ))
223 assert self
. thrash_in_replay
>= 0.0 and self
. thrash_in_replay
<= 1.0 , 'thrash_in_replay ( {v} ) must be between [0.0, 1.0]' . format (
224 v
= self
. thrash_in_replay
)
225 self
. max_replay_thrash_delay
= float ( self
. config
. get ( 'max_replay_thrash_delay' , 4.0 ))
226 self
. max_revive_delay
= float ( self
. config
. get ( 'max_revive_delay' , 10.0 ))
231 except Exception as e
:
232 # Log exceptions here so we get the full backtrace (gevent loses them).
233 # Also allow succesful completion as gevent exception handling is a broken mess:
235 # 2017-02-03T14:34:01.259 CRITICAL:root: File "gevent.libev.corecext.pyx", line 367, in gevent.libev.corecext.loop.handle_error (src/gevent/libev/gevent.corecext.c:5051)
236 # File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 558, in handle_error
237 # self.print_exception(context, type, value, tb)
238 # File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 605, in print_exception
239 # traceback.print_exception(type, value, tb, file=errstream)
240 # File "/usr/lib/python2.7/traceback.py", line 124, in print_exception
241 # _print(file, 'Traceback (most recent call last):')
242 # File "/usr/lib/python2.7/traceback.py", line 13, in _print
243 # file.write(str+terminator)
244 # 2017-02-03T14:34:01.261 CRITICAL:root:IOError
246 self
. logger
. exception ( "exception:" )
247 # allow successful completion so gevent doesn't see an exception...
250 """Write data to logger assigned to this MDThrasher"""
256 def kill_mds ( self
, mds
):
257 if self
. config
. get ( 'powercycle' ):
258 ( remote
,) = ( self
. ctx
. cluster
. only ( 'mds. {m} ' . format ( m
= mds
)).
260 self
. log ( 'kill_mds on mds. {m} doing powercycle of {s} ' .
261 format ( m
= mds
, s
= remote
. name
))
262 self
._ assert
_ ipmi
( remote
)
263 remote
. console
. power_off ()
265 self
. ctx
. daemons
. get_daemon ( 'mds' , mds
). stop ()
268 def _assert_ipmi ( remote
):
269 assert remote
. console
. has_ipmi_credentials
, (
270 "powercycling requested but RemoteConsole is not "
271 "initialized. Check ipmi config." )
273 def revive_mds ( self
, mds
, standby_for_rank
= None ):
275 Revive mds -- do an ipmpi powercycle (if indicated by the config)
276 and then restart (using --hot-standby if specified.
278 if self
. config
. get ( 'powercycle' ):
279 ( remote
,) = ( self
. ctx
. cluster
. only ( 'mds. {m} ' . format ( m
= mds
)).
281 self
. log ( 'revive_mds on mds. {m} doing powercycle of {s} ' .
282 format ( m
= mds
, s
= remote
. name
))
283 self
._ assert
_ ipmi
( remote
)
284 remote
. console
. power_on ()
285 self
. manager
. make_admin_daemon_dir ( self
. ctx
, remote
)
288 args
. extend ([ '--hot-standby' , standby_for_rank
])
289 self
. ctx
. daemons
. get_daemon ( 'mds' , mds
). restart (* args
)
291 def wait_for_stable ( self
, rank
= None , gid
= None ):
292 self
. log ( 'waiting for mds cluster to stabilize...' )
293 for itercount
in itertools
. count ():
294 status
= self
. fs
. status ()
295 max_mds
= status
. get_fsmap ( self
. fs
. id )[ 'mdsmap' ][ 'max_mds' ]
296 ranks
= list ( status
. get_ranks ( self
. fs
. id ))
297 stopping
= filter ( lambda info
: "up:stopping" == info
[ 'state' ], ranks
)
298 actives
= filter ( lambda info
: "up:active" == info
[ 'state' ] and "laggy_since" not in info
, ranks
)
300 if not bool ( self
. config
. get ( 'thrash_while_stopping' , False )) and len ( stopping
) > 0 :
301 if itercount
% 5 == 0 :
302 self
. log ( 'cluster is considered unstable while MDS are in up:stopping (!thrash_while_stopping)' )
306 info
= status
. get_rank ( self
. fs
. id , rank
)
307 if info
[ 'gid' ] != gid
and "up:active" == info
[ 'state' ]:
308 self
. log ( 'mds. {name} has gained rank= {rank} , replacing gid= {gid} ' . format ( name
= info
[ 'name' ], rank
= rank
, gid
= gid
))
311 pass # no rank present
312 if len ( actives
) >= max_mds
:
313 # no replacement can occur!
314 self
. log ( "cluster has %d actives (max_mds is %d ), no MDS can replace rank %d " . format ( len ( actives
), max_mds
, rank
))
317 if len ( actives
) >= max_mds
:
318 self
. log ( 'mds cluster has {count} alive and active, now stable!' . format ( count
= len ( actives
)))
320 if itercount
> 300 / 2 : # 5 minutes
321 raise RuntimeError ( 'timeout waiting for cluster to stabilize' )
322 elif itercount
% 5 == 0 :
323 self
. log ( 'mds map: {status} ' . format ( status
= status
))
325 self
. log ( 'no change' )
330 Perform the random thrashing action
333 self
. log ( 'starting mds_do_thrash for fs {fs} ' . format ( fs
= self
. fs
. name
))
340 while not self
. stopping
. is_set ():
341 delay
= self
. max_thrash_delay
343 delay
= random
. randrange ( 0.0 , self
. max_thrash_delay
)
346 self
. log ( 'waiting for {delay} secs before thrashing' . format ( delay
= delay
))
347 self
. stopping
. wait ( delay
)
348 if self
. stopping
. is_set ():
351 status
= self
. fs
. status ()
353 if random
. random () <= self
. thrash_max_mds
:
354 max_mds
= status
. get_fsmap ( self
. fs
. id )[ 'mdsmap' ][ 'max_mds' ]
355 options
= range ( 1 , max_mds
)+ range ( max_mds
+ 1 , self
. max_mds
+ 1 )
357 sample
= random
. sample ( options
, 1 )
358 new_max_mds
= sample
[ 0 ]
359 self
. log ( 'thrashing max_mds: %d -> %d ' % ( max_mds
, new_max_mds
))
360 self
. fs
. set_max_mds ( new_max_mds
)
361 stats
[ 'max_mds' ] += 1
363 # Now randomly deactivate mds if we shrank
364 # TODO: it's desirable to deactivate in order. Make config to do random.
365 targets
= filter ( lambda r
: r
[ 'rank' ] > 0 , status
. get_ranks ( self
. fs
. id )) # can't deactivate 0
366 for target
in random
. sample ( targets
, max ( 0 , max_mds
- new_max_mds
)):
367 self
. log ( "deactivating rank %d " % target
[ 'rank' ])
368 self
. fs
. deactivate ( target
[ 'rank' ])
369 stats
[ 'deactivate' ] += 1
371 status
= self
. wait_for_stable ()[ 0 ]
374 for info
in status
. get_ranks ( self
. fs
. id ):
376 label
= 'mds.' + name
380 # if thrash_weights isn't specified and we've reached max_thrash,
383 if 'thrash_weights' not in self
. config
and count
> self
. max_thrash
:
387 if 'thrash_weights' in self
. config
:
388 weight
= self
. config
[ 'thrash_weights' ]. get ( label
, '0.0' )
389 skip
= random
. randrange ( 0.0 , 1.0 )
391 self
. log ( 'skipping thrash iteration with skip ( {skip} ) > weight ( {weight} )' . format ( skip
= skip
, weight
= weight
))
394 self
. log ( 'kill {label} (rank= {rank} )' . format ( label
= label
, rank
= rank
))
398 # wait for mon to report killed mds as crashed
399 last_laggy_since
= None
402 status
= self
. fs
. status ()
403 info
= status
. get_mds ( name
)
406 if 'laggy_since' in info
:
407 last_laggy_since
= info
[ 'laggy_since' ]
409 if any ([( f
== name
) for f
in status
. get_fsmap ( self
. fs
. id )[ 'mdsmap' ][ 'failed' ]]):
412 'waiting till mds map indicates {label} is laggy/crashed, in failed state, or {label} is removed from mdsmap' . format (
414 itercount
= itercount
+ 1
416 self
. log ( 'mds map: {status} ' . format ( status
= status
))
421 ' {label} reported laggy/crashed since: {since} ' . format ( label
= label
, since
= last_laggy_since
))
423 self
. log ( ' {label} down, removed from mdsmap' . format ( label
= label
, since
= last_laggy_since
))
425 # wait for a standby mds to takeover and become active
426 status
= self
. wait_for_stable ( rank
, gid
)
428 # wait for a while before restarting old active to become new
430 delay
= self
. max_revive_delay
432 delay
= random
. randrange ( 0.0 , self
. max_revive_delay
)
434 self
. log ( 'waiting for {delay} secs before reviving {label} ' . format (
435 delay
= delay
, label
= label
))
438 self
. log ( 'reviving {label} ' . format ( label
= label
))
439 self
. revive_mds ( name
)
441 for itercount
in itertools
. count ():
442 if itercount
> 300 / 2 : # 5 minutes
443 raise RuntimeError ( 'timeout waiting for MDS to revive' )
444 status
= self
. fs
. status ()
445 info
= status
. get_mds ( name
)
446 if info
and info
[ 'state' ] in ( 'up:standby' , 'up:standby-replay' , 'up:active' ):
447 self
. log ( ' {label} reported in {state} state' . format ( label
= label
, state
= info
[ 'state' ]))
450 'waiting till mds map indicates {label} is in active, standby or standby-replay' . format ( label
= label
))
454 self
. log ( "stat[' {key} '] = {value} " . format ( key
= stat
, value
= stats
[ stat
]))
456 # don't do replay thrashing right now
457 # for info in status.get_replays(self.fs.id):
458 # # this might race with replay -> active transition...
459 # if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay:
460 # delay = self.max_replay_thrash_delay
462 # delay = random.randrange(0.0, self.max_replay_thrash_delay)
464 # self.log('kill replaying mds.{id}'.format(id=self.to_kill))
465 # self.kill_mds(self.to_kill)
467 # delay = self.max_revive_delay
469 # delay = random.randrange(0.0, self.max_revive_delay)
471 # self.log('waiting for {delay} secs before reviving mds.{id}'.format(
472 # delay=delay, id=self.to_kill))
475 # self.log('revive mds.{id}'.format(id=self.to_kill))
476 # self.revive_mds(self.to_kill)
479 @contextlib . contextmanager
480 def task ( ctx
, config
):
482 Stress test the mds by thrashing while another task/workunit
485 Please refer to MDSThrasher class for further information on the
489 mds_cluster
= MDSCluster ( ctx
)
493 assert isinstance ( config
, dict ), \
494 'mds_thrash task only accepts a dict for configuration'
495 mdslist
= list ( teuthology
. all_roles_of_type ( ctx
. cluster
, 'mds' ))
496 assert len ( mdslist
) > 1 , \
497 'mds_thrash task requires at least 2 metadata servers'
501 seed
= int ( config
[ 'seed' ])
503 seed
= int ( time
. time ())
504 log
. info ( 'mds thrasher using random seed: {seed} ' . format ( seed
= seed
))
507 ( first
,) = ctx
. cluster
. only ( 'mds. {_id} ' . format ( _id
= mdslist
[ 0 ])). remotes
. iterkeys ()
508 manager
= ceph_manager
. CephManager (
509 first
, ctx
= ctx
, logger
= log
. getChild ( 'ceph_manager' ),
512 # make sure everyone is in active, standby, or standby-replay
513 log
. info ( 'Wait for all MDSs to reach steady state...' )
514 status
= mds_cluster
. status ()
517 for info
in status
. get_all ():
518 state
= info
[ 'state' ]
519 if state
not in ( 'up:active' , 'up:standby' , 'up:standby-replay' ):
525 status
= mds_cluster
. status ()
526 log
. info ( 'Ready to start thrashing' )
530 watchdog
= DaemonWatchdog ( ctx
, manager
, config
, thrashers
)
533 manager
. wait_for_clean ()
534 assert manager
. is_clean ()
535 for fs
in status
. get_filesystems ():
536 thrasher
= MDSThrasher ( ctx
, manager
, config
, Filesystem ( ctx
, fs
[ 'id' ]), fs
[ 'mdsmap' ][ 'max_mds' ])
538 thrashers
. append ( thrasher
)
541 log
. debug ( 'Yielding' )
544 log
. info ( 'joining mds_thrashers' )
545 for thrasher
in thrashers
:
548 raise RuntimeError ( 'error during thrashing' )
550 log
. info ( 'done joining' )