]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | """ |
2 | diskprediction with local predictor | |
3 | """ | |
4 | import json | |
5 | import datetime | |
11fdf7f2 TL |
6 | from threading import Event |
7 | import time | |
20effc67 TL |
8 | from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING |
9 | from mgr_module import CommandResult, MgrModule, Option | |
9f95a23c TL |
10 | # Importing scipy early appears to avoid a future deadlock when |
11 | # we try to do | |
12 | # | |
13 | # from .predictor import get_diskfailurepredictor_path | |
14 | # | |
15 | # in a command thread. See https://tracker.ceph.com/issues/42764 | |
20effc67 TL |
16 | import scipy # noqa: ignore=F401 |
17 | from .predictor import DevSmartT, Predictor, get_diskfailurepredictor_path | |
9f95a23c | 18 | |
11fdf7f2 TL |
19 | |
# strftime/strptime pattern used for the persisted 'last_predicted' value.
TIME_FORMAT = '%Y%m%d-%H%M%S'

# Durations, expressed in seconds.
TIME_DAYS = 60 * 60 * 24
TIME_WEEK = 7 * TIME_DAYS
23 | ||
24 | ||
class Module(MgrModule):
    """Manager module that predicts disk failures with a local model.

    The ``serve`` loop periodically collects the SMART health metrics of
    every known device through the ``devicehealth`` module, feeds them to
    the configured predictor and stores the resulting life expectancy via
    the ``device set-life-expectancy`` / ``device rm-life-expectancy`` mon
    commands.  Predictions only run while the ceph option
    ``device_failure_prediction_mode`` is ``local``.
    """

    MODULE_OPTIONS = [
        Option(name='sleep_interval',
               default=600),
        Option(name='predict_interval',
               default=86400),
        Option(name='predictor_model',
               default='prophetstor')
    ]

    # Fallbacks used when the configured intervals are unset or unusable.
    _FALLBACK_SLEEP_INTERVAL = 60
    _FALLBACK_PREDICT_INTERVAL = 86400

    # Bounds on the number of health records handed to the predictor.
    _MIN_HEALTH_RECORDS = 6
    _MAX_HEALTH_RECORDS = 12

    # Predictor verdict (lower case) -> text returned to the caller.
    _LIFE_EXPECTANCY_LABELS: Dict[str, str] = {
        'good': '>6w',
        'warning': '>=2w and <=6w',
        'bad': '<2w',
    }

    # Predictor verdict (lower case) -> (min, max) offsets in seconds from
    # "now".  A max of None means the window has no upper bound.
    _LIFE_EXPECTANCY_WINDOWS: Dict[str, Tuple[int, Optional[int]]] = {
        'good': ((TIME_WEEK * 6) + TIME_DAYS, None),
        'warning': (TIME_WEEK * 2, TIME_WEEK * 6),
        'bad': (0, (TIME_WEEK * 2) - TIME_DAYS),
    }

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialise option attributes with their defaults and the run state."""
        super().__init__(*args, **kwargs)
        # options
        for opt in self.MODULE_OPTIONS:
            setattr(self, opt['name'], opt['default'])
        # other
        self._run = True
        self._event = Event()
        # for mypy which does not run the code
        if TYPE_CHECKING:
            self.sleep_interval = 0
            self.predict_interval = 0
            self.predictor_model = ''

    def config_notify(self) -> None:
        """Reload the module options and wake ``serve`` when in local mode."""
        self.refresh_config()
        if self.get_ceph_option('device_failure_prediction_mode') == 'local':
            self._event.set()

    def refresh_config(self) -> None:
        """Copy the current value of every module option onto ``self``."""
        for opt in self.MODULE_OPTIONS:
            name = opt['name']
            value = self.get_module_option(name)
            setattr(self, name, value)
            self.log.debug(' %s = %s', name, value)

    def self_test(self) -> None:
        """Run one prediction pass over all devices and check it succeeds."""
        self.log.debug('self_test enter')
        ret, _, _ = self.predict_all_devices()
        assert ret == 0, 'predict_all_devices returned %s' % ret

    @staticmethod
    def _positive_int(value: Any, fallback: int) -> int:
        """Return ``value`` as a positive int, or ``fallback`` if unusable.

        ``None``, zero, negative numbers and non-numeric values all yield
        ``fallback`` so that a bad option value can neither break the
        interval arithmetic nor turn the serve loop into a busy loop.
        """
        try:
            number = int(value)
        except (TypeError, ValueError):
            return fallback
        return number if number > 0 else fallback

    def _load_last_predicted(self) -> Optional[datetime.datetime]:
        """Read the persisted time of the last prediction run, if any."""
        stored = self.get_store('last_predicted')
        if not stored:
            return None
        try:
            return datetime.datetime.strptime(stored, TIME_FORMAT)
        except ValueError:
            # Treat an unparsable value as "never predicted", but leave a
            # trace instead of dropping it silently.
            self.log.warning('ignoring malformed last_predicted value %r', stored)
            return None

    def _next_prediction_time(self,
                              last_predicted: datetime.datetime) -> datetime.datetime:
        """Return the first ``predict_interval`` boundary after ``last_predicted``.

        Boundaries are multiples of the interval counted from the Unix epoch.
        """
        frequency = self._positive_int(self.predict_interval,
                                       self._FALLBACK_PREDICT_INTERVAL)
        epoch = datetime.datetime.utcfromtimestamp(0)
        seconds = (last_predicted - epoch).total_seconds()
        seconds -= seconds % frequency
        seconds += frequency
        return datetime.datetime.utcfromtimestamp(seconds)

    def serve(self) -> None:
        """Main loop: predict all devices whenever the next run is due.

        Runs until ``shutdown`` clears ``self._run``.  Between iterations it
        waits on ``self._event`` for at most ``sleep_interval`` seconds, so
        ``config_notify`` and ``shutdown`` can wake it early.
        """
        self.log.info('Starting diskprediction local module')
        self.config_notify()
        last_predicted = self._load_last_predicted()
        self.log.debug('Last predicted %s', last_predicted)

        while self._run:
            self.refresh_config()
            mode = self.get_ceph_option('device_failure_prediction_mode')
            if mode == 'local':
                now = datetime.datetime.utcnow()
                if last_predicted is None:
                    # Never predicted before: run immediately.
                    next_predicted = now
                else:
                    next_predicted = self._next_prediction_time(last_predicted)
                    self.log.debug('Last scrape %s, next scrape due %s',
                                   last_predicted.strftime(TIME_FORMAT),
                                   next_predicted.strftime(TIME_FORMAT))
                if now >= next_predicted:
                    self.predict_all_devices()
                    last_predicted = now
                    self.set_store('last_predicted', last_predicted.strftime(TIME_FORMAT))

            sleep_interval = self._positive_int(self.sleep_interval,
                                                self._FALLBACK_SLEEP_INTERVAL)
            self.log.debug('Sleeping for %d seconds', sleep_interval)
            self._event.wait(sleep_interval)
            self._event.clear()

    def shutdown(self) -> None:
        """Ask the serve loop to stop and wake it up."""
        self.log.info('Stopping')
        self._run = False
        self._event.set()

    @staticmethod
    def _convert_timestamp(predicted_timestamp: int, life_expectancy_day: int) -> str:
        """Format ``predicted_timestamp`` shifted by ``life_expectancy_day``.

        :param predicted_timestamp: unit is nanoseconds
        :param life_expectancy_day: offset to add; despite the name, the
            unit is seconds
        :return:
            date format '%Y-%m-%d' ex. 2018-01-01
        """
        return datetime.datetime.fromtimestamp(
            predicted_timestamp / (1000 ** 3) + life_expectancy_day).strftime('%Y-%m-%d')

    def _fetch_health_data(self, devid: str) -> Dict[str, Dict[str, Any]]:
        """Return the device metrics reported by ``devicehealth`` for ``devid``.

        Any failure (non-zero return code, exception, unexpected payload) is
        logged and reported as an empty dict.
        """
        try:
            r, outb, outs = self.remote(
                'devicehealth', 'show_device_metrics', devid=devid, sample='')
            if r != 0:
                self.log.error('failed to get device %s health', devid)
                return {}
            health_data = json.loads(outb)
        except Exception as e:
            self.log.error('failed to get device %s health data due to %s', devid, str(e))
            return {}
        if not isinstance(health_data, dict):
            # The records are looked up by key below, so anything other
            # than a JSON object cannot be processed.
            self.log.error('failed to get device %s health data due to %s',
                           devid, 'unexpected payload type')
            return {}
        return health_data

    @staticmethod
    def _parse_raw_value(raw: Dict[str, Any]) -> Any:
        """Extract a raw SMART value from an attribute's ``raw`` entry.

        Prefers ``raw['string']`` when it is all digits, then its first
        space-separated token when that is all digits, and finally falls
        back to ``raw['value']`` (0 if absent).
        """
        text = str(raw.get('string', '0'))
        if text.isdigit():
            return int(text)
        head = text.split(' ')[0]
        if head.isdigit():
            return int(head)
        return raw.get('value', 0)

    def _build_dev_smart(self, s_val: Dict[str, Any]) -> DevSmartT:
        """Flatten one health record into the predictor's input mapping."""
        dev_smart: DevSmartT = {}

        # add all smart attributes
        ata_smart = s_val.get('ata_smart_attributes', {})
        for attr in ata_smart.get('table', []):
            attr_id = attr.get('id')
            # get raw smart values
            raw = attr.get('raw', {})
            if raw.get('string') is not None:
                dev_smart['smart_%s_raw' % attr_id] = self._parse_raw_value(raw)
            # get normalized smart values
            normalized = attr.get('value')
            if normalized is not None:
                dev_smart['smart_%s_normalized' % attr_id] = normalized

        # power on hours: when reported at record level this value always
        # takes precedence over attribute 9 from the table above
        power_on_time = s_val.get('power_on_time', {}).get('hours')
        if power_on_time is not None:
            dev_smart['smart_9_raw'] = int(power_on_time)
        # add device capacity
        user_capacity = s_val.get('user_capacity', {}).get('bytes')
        if user_capacity is not None:
            dev_smart['user_capacity'] = user_capacity
        else:
            self.log.debug('user_capacity not found in smart attributes list')
        # add device model
        model_name = s_val.get('model_name')
        if model_name is not None:
            dev_smart['model_name'] = model_name
        # add vendor
        vendor = s_val.get('vendor')
        if vendor is not None:
            dev_smart['vendor'] = vendor
        return dev_smart

    def _predict_life_expectancy(self, devid: str) -> str:
        """Run the configured predictor on the health history of ``devid``.

        :return: the predictor's verdict, or '' when no prediction could be
            made (too few health records, invalid or failing predictor).
        """
        predicted_result = ''
        health_data = self._fetch_health_data(devid)

        # Check the cheap precondition first so that the model is not
        # loaded for devices that cannot be predicted anyway.
        if len(health_data) < self._MIN_HEALTH_RECORDS:
            self.log.error('unable to predict device due to health data records less than 6 days')
            return predicted_result

        # initialize appropriate disk failure predictor model
        obj_predictor = Predictor.create(self.predictor_model)
        if obj_predictor is None:
            self.log.error('invalid value received for MODULE_OPTIONS.predictor_model')
            return predicted_result
        try:
            obj_predictor.initialize(
                "{}/models/{}".format(get_diskfailurepredictor_path(), self.predictor_model))
        except Exception as e:
            self.log.error('Error initializing predictor: %s', e)
            return predicted_result

        # Walk the records in reverse key order and keep at most
        # _MAX_HEALTH_RECORDS non-empty ones.
        predict_datas: List[DevSmartT] = []
        for o_key in sorted(health_data.keys(), reverse=True):
            dev_smart = self._build_dev_smart(health_data[o_key])
            # if smart data was found, then add that to list
            if dev_smart:
                predict_datas.append(dev_smart)
            if len(predict_datas) >= self._MAX_HEALTH_RECORDS:
                break

        if len(predict_datas) >= self._MIN_HEALTH_RECORDS:
            predicted_result = obj_predictor.predict(predict_datas)
        return predicted_result

    def predict_life_expectancy(self, devid: str) -> Tuple[int, str, str]:
        """Return ``(0, label, '')`` where label describes the verdict.

        The label is '>6w', '>=2w and <=6w', '<2w' or 'unknown'.
        """
        verdict = self._predict_life_expectancy(devid).lower()
        return 0, self._LIFE_EXPECTANCY_LABELS.get(verdict, 'unknown'), ''

    def _reset_device_life_expectancy(self, device_id: str) -> int:
        """Remove the stored life expectancy of ``device_id``.

        :return: return code of the mon command (0 on success)
        """
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'device rm-life-expectancy',
            'devid': device_id
        }), '')
        ret, _, outs = result.wait()
        if ret != 0:
            self.log.error('failed to reset device life expectancy, %s', outs)
        return ret

    def _set_device_life_expectancy(self,
                                    device_id: str,
                                    from_date: str,
                                    to_date: Optional[str] = None) -> int:
        """Store the life expectancy window of ``device_id``.

        :param from_date: start of the window
        :param to_date: end of the window; omitted from the command when None
        :return: return code of the mon command (0 on success)
        """
        command = {
            'prefix': 'device set-life-expectancy',
            'devid': device_id,
            'from': from_date
        }
        if to_date is not None:
            command['to'] = to_date

        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps(command), '')
        ret, _, outs = result.wait()
        if ret != 0:
            self.log.error('failed to set device life expectancy, %s', outs)
        return ret

    def predict_all_devices(self) -> Tuple[int, str, str]:
        """Predict and store the life expectancy of every device in use.

        Devices without daemons or without a devid are skipped.  A device
        whose verdict is not one of good/warning/bad has its stored life
        expectancy removed.

        :return: always ``(0, 'succeed to predicted all devices', '')``
        """
        self.log.debug('predict_all_devices')
        devices = self.get('devices').get('devices', [])
        for devInfo in devices:
            if not devInfo.get('daemons'):
                continue
            devid = devInfo.get('devid')
            if not devid:
                continue
            self.log.debug('%s', devInfo)

            verdict = self._predict_life_expectancy(devid).lower()
            window = self._LIFE_EXPECTANCY_WINDOWS.get(verdict)
            if window is None:
                # No usable prediction: drop whatever was stored before.
                self._reset_device_life_expectancy(devid)
                continue

            # A minimum offset of 0 (the 'bad' verdict) is a valid window
            # starting now; it must not be mistaken for "no prediction".
            day_min, day_max = window
            predicted = int(time.time() * (1000 ** 3))
            from_date = None
            to_date = None
            try:
                from_date = self._convert_timestamp(predicted, day_min)
                if day_max is not None:
                    to_date = self._convert_timestamp(predicted, day_max)
                ret = self._set_device_life_expectancy(devid, from_date, to_date)
            except Exception as e:
                self.log.error(
                    'failed to set device %s life expectancy from: %s, to: %s, %s',
                    devid, from_date, to_date, e)
                continue
            # A non-zero return code has already been logged by
            # _set_device_life_expectancy; only report real successes.
            if ret == 0:
                self.log.info(
                    'succeed to set device %s life expectancy from: %s, to: %s',
                    devid, from_date, to_date)
        return 0, 'succeed to predicted all devices', ''