]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | """ |
2 | Task for thrashing rbd-mirror daemons | |
3 | """ | |
4 | ||
5 | import contextlib | |
6 | import logging | |
7 | import random | |
8 | import signal | |
9 | import socket | |
10 | import time | |
11 | ||
12 | from gevent import sleep | |
13 | from gevent.greenlet import Greenlet | |
14 | from gevent.event import Event | |
15 | ||
11fdf7f2 | 16 | from teuthology.exceptions import CommandFailedError |
11fdf7f2 | 17 | from teuthology.orchestra import run |
9f95a23c | 18 | from tasks.thrasher import Thrasher |
11fdf7f2 TL |
19 | |
20 | log = logging.getLogger(__name__) | |
21 | ||
22 | ||
class RBDMirrorThrasher(Thrasher, Greenlet):
    """
    RBDMirrorThrasher::

    The RBDMirrorThrasher thrashes rbd-mirror daemons during execution of other
    tasks (workunits, etc).

    The config is optional. Many of the config parameters are a maximum value
    to use when selecting a random value from a range. The config is a dict
    containing some or all of:

    cluster: [default: ceph] cluster to thrash

    max_thrash: [default: 1] the maximum number of active rbd-mirror daemons
      per cluster that will be thrashed at any given time.

    min_thrash_delay: [default: 60] minimum number of seconds to delay before
      thrashing again.

    max_thrash_delay: [default: 120] maximum number of seconds to delay before
      thrashing again.

    max_revive_delay: [default: 10] maximum number of seconds to delay before
      bringing back a thrashed rbd-mirror daemon.

    randomize: [default: true] enables randomization and use the max/min values

    seed: [no default] seed the random number generator

    Examples::

      The following example disables randomization, and uses the max delay
      values:

      tasks:
      - ceph:
      - rbd_mirror_thrash:
          randomize: False
          max_thrash_delay: 10
    """

    def __init__(self, ctx, config, cluster, daemons):
        """
        Store the run context and parse the thrashing knobs out of ``config``.

        :param ctx: task context object; stored as-is on the instance
        :param config: dict of options (see the class docstring)
        :param cluster: name of the cluster whose daemons are thrashed
        :param daemons: the rbd-mirror daemon handles to choose victims from
        """
        super(RBDMirrorThrasher, self).__init__()

        self.ctx = ctx
        self.config = config
        self.cluster = cluster
        self.daemons = daemons

        self.logger = log
        self.name = 'thrasher.rbd_mirror.[{cluster}]'.format(cluster = cluster)
        # Set by stop(); do_thrash() polls and waits on it.
        self.stopping = Event()

        self.randomize = bool(self.config.get('randomize', True))
        self.max_thrash = int(self.config.get('max_thrash', 1))
        # All delays are kept as floats (seconds).
        self.min_thrash_delay = float(self.config.get('min_thrash_delay', 60.0))
        self.max_thrash_delay = float(self.config.get('max_thrash_delay', 120.0))
        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))

    def _run(self):
        """
        Greenlet entry point: run do_thrash() and record, rather than
        propagate, any exception it raises.
        """
        try:
            self.do_thrash()
        except Exception as e:
            # See _run exception comment for MDSThrasher
            self.set_thrasher_exception(e)
            self.logger.exception("exception:")
            # Allow successful completion so gevent doesn't see an exception.
            # The DaemonWatchdog will observe the error and tear down the test.

    def log(self, x):
        """Write data to logger assigned to this RBDMirrorThrasher"""
        self.logger.info(x)

    def stop(self):
        """Ask the thrash loop to finish; does not wait for it to exit."""
        self.stopping.set()

    def do_thrash(self):
        """
        Perform the random thrashing action

        Loops until stop() is called. Each round waits for the thrash delay,
        sends SIGTERM to a random selection of at most ``max_thrash`` daemons,
        waits for the revive delay, waits for the killed processes to exit and
        then starts them again. Kill statistics are logged on exit.
        """

        self.log('starting thrash for cluster {cluster}'.format(cluster=self.cluster))
        stats = {
            "kill": 0,
        }

        while not self.stopping.is_set():
            delay = self.max_thrash_delay
            if self.randomize:
                # The delays are floats; random.randrange() only accepts
                # integers (TypeError on Python >= 3.12) and rejects an empty
                # range when min == max, so draw with uniform() instead.
                delay = random.uniform(self.min_thrash_delay, self.max_thrash_delay)

            if delay > 0.0:
                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
                # Returns early if stop() is called while waiting.
                self.stopping.wait(delay)
                if self.stopping.is_set():
                    continue

            killed_daemons = []

            # Each daemon is picked with probability 1/len(daemons), so a
            # round kills one daemon on average and may kill none at all.
            weight = 1.0 / len(self.daemons)
            count = 0
            for daemon in self.daemons:
                skip = random.uniform(0.0, 1.0)
                if weight <= skip:
                    self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format(
                        label=daemon.id_, skip=skip, weight=weight))
                    continue

                self.log('kill {label}'.format(label=daemon.id_))
                try:
                    daemon.signal(signal.SIGTERM)
                except socket.error:
                    # Best effort: a failure to deliver the signal is ignored.
                    pass
                killed_daemons.append(daemon)
                stats['kill'] += 1

                # if we've reached max_thrash, we're done
                count += 1
                if count >= self.max_thrash:
                    break

            if killed_daemons:
                # wait for a while before restarting
                delay = self.max_revive_delay
                if self.randomize:
                    # uniform() for the same reason as above; it also copes
                    # with max_revive_delay == 0, where randrange() raises.
                    delay = random.uniform(0.0, self.max_revive_delay)

                self.log('waiting for {delay} secs before reviving daemons'.format(delay=delay))
                sleep(delay)

                for daemon in killed_daemons:
                    self.log('waiting for {label}'.format(label=daemon.id_))
                    try:
                        run.wait([daemon.proc], timeout=600)
                    except CommandFailedError:
                        # A failed exit status is tolerated for a daemon we
                        # terminated ourselves.
                        pass
                    except BaseException:
                        # Deliberately broad (equivalent to a bare except):
                        # whatever went wrong is re-raised below after the
                        # abort signal has been attempted.
                        self.log('Failed to stop {label}'.format(label=daemon.id_))

                        try:
                            # try to capture a core dump
                            daemon.signal(signal.SIGABRT)
                        except socket.error:
                            pass
                        raise
                    finally:
                        daemon.reset()

                for daemon in killed_daemons:
                    self.log('reviving {label}'.format(label=daemon.id_))
                    daemon.start()

        for key, value in stats.items():
            self.log("stat['{key}'] = {value}".format(key = key, value = value))
177 | ||
@contextlib.contextmanager
def task(ctx, config):
    """
    Stress test the rbd-mirror by thrashing while another task/workunit
    is running.

    Please refer to RBDMirrorThrasher class for further information on the
    available options.

    :param ctx: task context; must provide ``ctx.daemons`` and
                ``ctx.ceph[cluster].thrashers``
    :param config: dict of options, or None for all defaults
    :raises AssertionError: if ``config`` is not a dict or the cluster has no
                            rbd-mirror daemons
    :raises RuntimeError: on exit, if the thrasher recorded an exception
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'rbd_mirror_thrash task only accepts a dict for configuration'

    cluster = config.get('cluster', 'ceph')
    daemons = list(ctx.daemons.iter_daemons_of_role('rbd-mirror', cluster))
    assert len(daemons) > 0, \
        'rbd_mirror_thrash task requires at least 1 rbd-mirror daemon'

    # choose random seed
    if 'seed' in config:
        seed = int(config['seed'])
    else:
        seed = int(time.time())
    # Log the seed so a run can be reproduced by passing it back in.
    log.info('rbd_mirror_thrash using random seed: {seed}'.format(seed=seed))
    random.seed(seed)

    thrasher = RBDMirrorThrasher(ctx, config, cluster, daemons)
    thrasher.start()
    ctx.ceph[cluster].thrashers.append(thrasher)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining rbd_mirror_thrash')
        thrasher.stop()
        # Join before inspecting the result: the greenlet may still be in its
        # final wait/revive phase after stop(), and a failure there would be
        # missed (and the greenlet left unjoined) if we checked first.
        thrasher.join()
        log.info('done joining')
        if thrasher.exception is not None:
            raise RuntimeError('error during thrashing')
218 | log.info('done joining') |