]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | """ |
2 | diskprediction with local predictor | |
3 | """ | |
4 | import json | |
5 | import datetime | |
6 | import _strptime | |
7 | from threading import Event | |
8 | import time | |
9 | ||
10 | from mgr_module import MgrModule, CommandResult | |
11 | ||
12 | ||
# strftime/strptime pattern used when persisting the last prediction time.
TIME_FORMAT = '%Y%m%d-%H%M%S'
# Number of seconds in one day (despite the plural name).
TIME_DAYS = 24 * 60 * 60
# Number of seconds in one week.
TIME_WEEK = 7 * TIME_DAYS
16 | ||
17 | ||
class Module(MgrModule):
    """
    Manager module that periodically runs the local disk failure predictor
    over the known devices and records the resulting life expectancy.

    The background loop in :meth:`serve` only predicts while the ceph option
    ``device_failure_prediction_mode`` equals ``'local'``.
    """

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(600),
        },
        {
            'name': 'predict_interval',
            'default': str(86400),
        },
    ]

    COMMANDS = []

    # Predictor verdict (lower-cased) -> human readable life expectancy.
    _LIFE_EXPECTANCY_LABELS = {
        'good': '>6w',
        'warning': '>=2w and <=6w',
        'bad': '<2w',
    }

    # Predictor verdict (lower-cased) -> (min, max) offsets in seconds from
    # the prediction time. A max of None means "no upper bound".
    _LIFE_EXPECTANCY_WINDOWS = {
        'good': ((TIME_WEEK * 6) + TIME_DAYS, None),
        'warning': (TIME_WEEK * 2, TIME_WEEK * 6),
        'bad': (0, (TIME_WEEK * 2) - TIME_DAYS),
    }

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        # options: start from the declared defaults until config is read
        for opt in self.MODULE_OPTIONS:
            setattr(self, opt['name'], opt['default'])
        # other
        self._run = True
        self._event = Event()

    def config_notify(self):
        """
        Reload the module options and wake the serve loop when local
        prediction mode is enabled.
        """
        self.refresh_config()
        if self.get_ceph_option('device_failure_prediction_mode') == 'local':
            self._event.set()

    def refresh_config(self):
        """
        Copy every entry of MODULE_OPTIONS onto the instance, falling back to
        the declared default when the option is unset or empty.
        """
        for opt in self.MODULE_OPTIONS:
            name = opt['name']
            setattr(self, name, self.get_module_option(name) or opt['default'])
            self.log.debug(' %s = %s', name, getattr(self, name))

    def handle_command(self, _, cmd):
        """
        No commands are declared (COMMANDS is empty), so any command is
        rejected.

        :raises NotImplementedError: always, carrying the command prefix
        """
        self.log.debug('handle_command cmd: %s', cmd)
        raise NotImplementedError(cmd['prefix'])

    def self_test(self):
        """
        Run one prediction pass over all devices and assert it reports 0.

        :return: tuple (0, 'self test succeed', '')
        """
        ret, _, _ = self.predict_all_devices()
        assert ret == 0
        return 0, 'self test succeed', ''

    def serve(self):
        """
        Background loop: while running and in 'local' mode, predict all
        devices whenever the next prediction slot is due, then sleep for
        ``sleep_interval`` seconds or until woken through the event.
        """
        self.log.info('Starting diskprediction local module')
        self.config_notify()
        last_predicted = None
        ls = self.get_store('last_predicted')
        if ls:
            try:
                last_predicted = datetime.datetime.strptime(ls, TIME_FORMAT)
            except ValueError:
                # Treat an unparseable value as "never predicted" but leave
                # a trace instead of silently discarding it.
                self.log.warning(
                    'ignoring unparseable last_predicted value %r', ls)
        self.log.debug('Last predicted %s', last_predicted)

        while self._run:
            self.refresh_config()
            mode = self.get_ceph_option('device_failure_prediction_mode')
            if mode == 'local':
                now = datetime.datetime.utcnow()
                if not last_predicted:
                    next_predicted = now
                    self.log.debug(
                        'Last prediction never, next prediction due %s',
                        next_predicted.strftime(TIME_FORMAT))
                else:
                    # Round the last prediction down to a multiple of the
                    # interval (counted from the epoch) and move one
                    # interval forward.
                    predicted_frequency = int(self.predict_interval) or 86400
                    epoch = datetime.datetime.utcfromtimestamp(0)
                    seconds = (last_predicted - epoch).total_seconds()
                    seconds -= seconds % predicted_frequency
                    seconds += predicted_frequency
                    next_predicted = datetime.datetime.utcfromtimestamp(seconds)
                    self.log.debug(
                        'Last prediction %s, next prediction due %s',
                        last_predicted.strftime(TIME_FORMAT),
                        next_predicted.strftime(TIME_FORMAT))
                if now >= next_predicted:
                    self.predict_all_devices()
                    last_predicted = now
                    self.set_store('last_predicted',
                                   last_predicted.strftime(TIME_FORMAT))

            sleep_interval = int(self.sleep_interval) or 60
            self.log.debug('Sleeping for %d seconds', sleep_interval)
            self._event.wait(sleep_interval)
            self._event.clear()

    def shutdown(self):
        """Ask the serve loop to exit and wake it up if it is sleeping."""
        self.log.info('Stopping')
        self._run = False
        self._event.set()

    @staticmethod
    def _convert_timestamp(predicted_timestamp, life_expectancy_day):
        """
        :param predicted_timestamp: unit is nanoseconds
        :param life_expectancy_day: offset to add, unit is seconds
        :return:
            date format '%Y-%m-%d' ex. 2018-01-01 (local time, as produced
            by datetime.fromtimestamp)
        """
        return datetime.datetime.fromtimestamp(
            predicted_timestamp / (1000 ** 3) + life_expectancy_day
        ).strftime('%Y-%m-%d')

    @staticmethod
    def _extract_smart_sample(sample):
        """
        Build the predictor input for one health sample.

        :param sample: one entry of the device metrics mapping
        :return: dict of 'smart_<id>_raw' values; empty when nothing usable
        """
        dev_smart = {}
        for attr in sample.get('ata_smart_attributes', {}).get('table', []):
            raw = attr.get('raw', {})
            raw_string = raw.get('string')
            if not raw_string:
                # Attributes without a raw string are skipped entirely.
                continue
            key = 'smart_%s_raw' % attr.get('id')
            text = str(raw_string)
            first_token = text.split(' ')[0]
            if text.isdigit():
                dev_smart[key] = int(text)
            elif first_token.isdigit():
                # e.g. "<number> <extra text>": keep the leading number.
                dev_smart[key] = int(first_token)
            else:
                dev_smart[key] = raw.get('value', 0)
        hours = sample.get('power_on_time', {}).get('hours')
        if hours is not None:
            # power_on_time overrides whatever the table gave for id 9.
            dev_smart['smart_9_raw'] = int(hours)
        return dev_smart

    def _predict_life_expentancy(self, devid):
        """
        Fetch the health metrics of a device from the devicehealth module
        and run the local predictor on the newest samples.

        :param devid: device id to predict
        :return: the predictor output, or '' when no prediction was made
        """
        predicted_result = ''
        from .predictor import get_diskfailurepredictor_path, DiskFailurePredictor
        health_data = {}
        predict_datas = []
        try:
            r, outb, outs = self.remote(
                'devicehealth', 'show_device_metrics', devid=devid, sample='')
            if r != 0:
                self.log.error('failed to get device %s health', devid)
                health_data = {}
            else:
                health_data = json.loads(outb)
        except Exception as e:
            self.log.error('failed to get device %s health data due to %s',
                           devid, str(e))

        obj_predictor = DiskFailurePredictor()
        obj_predictor.initialize(
            "{}/models".format(get_diskfailurepredictor_path()))

        if len(health_data) >= 6:
            # Walk the sample keys in descending order and keep at most 12
            # non-empty samples.
            for o_key in sorted(health_data, reverse=True):
                dev_smart = self._extract_smart_sample(health_data[o_key])
                if dev_smart:
                    predict_datas.append(dev_smart)
                if len(predict_datas) >= 12:
                    break
        else:
            self.log.error('unable to predict device due to health data records less than 6 days')

        if predict_datas:
            predicted_result = obj_predictor.predict(predict_datas)
        return predicted_result

    def predict_life_expectancy(self, devid):
        """
        Predict a single device and translate the verdict into a range.

        :param devid: device id to predict
        :return: tuple (0, label, '') where label is '>6w',
            '>=2w and <=6w', '<2w' or 'unknown'
        """
        result = self._predict_life_expentancy(devid)
        label = self._LIFE_EXPECTANCY_LABELS.get(result.lower(), 'unknown')
        return 0, label, ''

    def _reset_device_life_expectancy(self, device_id):
        """
        Send 'device rm-life-expectancy' for the device to the mon.

        :return: the command return code (non-zero results are logged)
        """
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'device rm-life-expectancy',
            'devid': device_id
        }), '')
        ret, _, outs = result.wait()
        if ret != 0:
            self.log.error(
                'failed to reset device life expectancy, %s', outs)
        return ret

    def _set_device_life_expectancy(self, device_id, from_date, to_date=None):
        """
        Send 'device set-life-expectancy' for the device to the mon.

        :param from_date: start of the expectancy range
        :param to_date: end of the range; omitted from the command if None
        :return: the command return code (non-zero results are logged)
        """
        result = CommandResult('')
        command = {
            'prefix': 'device set-life-expectancy',
            'devid': device_id,
            'from': from_date,
        }
        if to_date is not None:
            command['to'] = to_date
        self.send_command(result, 'mon', '', json.dumps(command), '')
        ret, _, outs = result.wait()
        if ret != 0:
            self.log.error(
                'failed to set device life expectancy, %s', outs)
        return ret

    def predict_all_devices(self):
        """
        Predict every device that has daemons and a devid, and set or reset
        its life expectancy according to the verdict.

        :return: tuple (0, 'succeed to predicted all devices', '')
        """
        devices = self.get('devices').get('devices', [])
        for dev_info in devices:
            if not dev_info.get('daemons') or not dev_info.get('devid'):
                continue
            devid = dev_info['devid']
            result = self._predict_life_expentancy(devid)
            window = self._LIFE_EXPECTANCY_WINDOWS.get(result.lower())
            if window is None:
                # 'unknown', empty or unrecognised verdict: drop stale data.
                self._reset_device_life_expectancy(devid)
                continue

            # A minimum of 0 ('bad') is a valid lower bound, so the window
            # is unpacked without any truthiness test on it.
            day_min, day_max = window
            predicted = int(time.time() * (1000 ** 3))
            from_date = None
            to_date = None
            try:
                from_date = self._convert_timestamp(predicted, day_min)
                if day_max is not None:
                    to_date = self._convert_timestamp(predicted, day_max)
                ret = self._set_device_life_expectancy(
                    devid, from_date, to_date)
            except Exception as e:
                self.log.error(
                    'failed to set device %s life expectancy from: %s, to: %s, %s',
                    devid, from_date, to_date, str(e))
            else:
                # Failures are already logged by the setter; only claim
                # success when the mon accepted the command.
                if ret == 0:
                    self.log.info(
                        'succeed to set device %s life expectancy from: %s, to: %s',
                        devid, from_date, to_date)
        return 0, 'succeed to predicted all devices', ''