]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/diskprediction_local/module.py
import quincy beta 17.1.0
[ceph.git] / ceph / src / pybind / mgr / diskprediction_local / module.py
CommitLineData
11fdf7f2
TL
1"""
2diskprediction with local predictor
3"""
4import json
5import datetime
11fdf7f2
TL
6from threading import Event
7import time
20effc67
TL
8from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING
9from mgr_module import CommandResult, MgrModule, Option
9f95a23c
TL
10# Importing scipy early appears to avoid a future deadlock when
11# we try to do
12#
13# from .predictor import get_diskfailurepredictor_path
14#
15# in a command thread. See https://tracker.ceph.com/issues/42764
20effc67
TL
16import scipy # noqa: ignore=F401
17from .predictor import DevSmartT, Predictor, get_diskfailurepredictor_path
9f95a23c 18
11fdf7f2
TL
19
# strftime/strptime layout used when persisting the time of the last
# prediction run in the module's key/value store.
TIME_FORMAT = '%Y%m%d-%H%M%S'
# Durations below are expressed in seconds.
TIME_DAYS = 24 * 60 * 60
TIME_WEEK = TIME_DAYS * 7
23
24
class Module(MgrModule):
    """
    Manager module that predicts disk life expectancy with a local model.

    While the ceph option ``device_failure_prediction_mode`` is ``local``,
    the ``serve`` loop periodically fetches SMART health metrics for every
    known device from the ``devicehealth`` module, feeds them to the
    configured predictor and records the resulting life expectancy window
    through ``device set-life-expectancy`` / ``device rm-life-expectancy``
    mon commands.
    """

    MODULE_OPTIONS = [
        Option(name='sleep_interval',
               default=600),
        Option(name='predict_interval',
               default=86400),
        Option(name='predictor_model',
               default='prophetstor')
    ]

    # Prediction label (lower case) -> (seconds from now until the start of
    # the life expectancy window, seconds from now until its end).  An end
    # of None means the window has no upper bound.
    _LIFE_EXPECTANCY_WINDOWS: Dict[str, Tuple[int, Optional[int]]] = {
        'good': ((TIME_WEEK * 6) + TIME_DAYS, None),
        'warning': (TIME_WEEK * 2, TIME_WEEK * 6),
        'bad': (0, (TIME_WEEK * 2) - TIME_DAYS),
    }

    # Prediction label (lower case) -> human readable life expectancy.
    _LIFE_EXPECTANCY_LABELS: Dict[str, str] = {
        'good': '>6w',
        'warning': '>=2w and <=6w',
        'bad': '<2w',
    }

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Seed every module option with its default and set up the run state."""
        super().__init__(*args, **kwargs)
        # options
        for opt in self.MODULE_OPTIONS:
            setattr(self, opt['name'], opt['default'])
        # other
        self._run = True
        self._event = Event()
        # for mypy which does not run the code
        if TYPE_CHECKING:
            self.sleep_interval = 0
            self.predict_interval = 0
            self.predictor_model = ''

    def config_notify(self) -> None:
        """
        Reload the module options and wake the serve loop when local
        prediction mode is enabled.
        """
        self.refresh_config()
        if self.get_ceph_option('device_failure_prediction_mode') == 'local':
            self._event.set()

    def refresh_config(self) -> None:
        """Copy the current value of every module option onto ``self``."""
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))

    def self_test(self) -> None:
        """Run one prediction pass over all devices and assert it succeeds."""
        self.log.debug('self_test enter')
        ret, _, _ = self.predict_all_devices()
        assert ret == 0

    def serve(self) -> None:
        """
        Main loop: run a prediction pass whenever one is due, then sleep.

        The time of the last pass is persisted under the ``last_predicted``
        store key so the schedule is picked up again on the next start.
        """
        self.log.info('Starting diskprediction local module')
        self.config_notify()
        last_predicted = None
        ls = self.get_store('last_predicted')
        if ls:
            try:
                last_predicted = datetime.datetime.strptime(ls, TIME_FORMAT)
            except ValueError:
                # A malformed stored value is treated as "never predicted".
                self.log.debug('Ignoring malformed last_predicted value %s', ls)
        self.log.debug('Last predicted %s', last_predicted)

        while self._run:
            self.refresh_config()
            mode = self.get_ceph_option('device_failure_prediction_mode')
            if mode == 'local':
                now = datetime.datetime.utcnow()
                if not last_predicted:
                    next_predicted = now
                else:
                    predicted_frequency = self.predict_interval or 86400
                    epoch = datetime.datetime.utcfromtimestamp(0)
                    seconds = (last_predicted - epoch).total_seconds()
                    # Round down to the start of the interval containing
                    # the last run, then step to the next interval.
                    seconds -= seconds % predicted_frequency
                    seconds += predicted_frequency
                    next_predicted = datetime.datetime.utcfromtimestamp(seconds)
                    self.log.debug('Last scrape %s, next scrape due %s',
                                   last_predicted.strftime(TIME_FORMAT),
                                   next_predicted.strftime(TIME_FORMAT))
                if now >= next_predicted:
                    self.predict_all_devices()
                    last_predicted = now
                    self.set_store('last_predicted',
                                   last_predicted.strftime(TIME_FORMAT))

            sleep_interval = self.sleep_interval or 60
            self.log.debug('Sleeping for %d seconds', sleep_interval)
            self._event.wait(sleep_interval)
            self._event.clear()

    def shutdown(self) -> None:
        """Ask the serve loop to exit and wake it up if it is sleeping."""
        self.log.info('Stopping')
        self._run = False
        self._event.set()

    @staticmethod
    def _convert_timestamp(predicted_timestamp: int, life_expectancy_day: int) -> str:
        """
        Format ``predicted_timestamp`` shifted by ``life_expectancy_day``.

        :param predicted_timestamp: unit is nanoseconds
        :param life_expectancy_day: offset to add, unit is seconds
        :return: date format '%Y-%m-%d' ex. 2018-01-01
        """
        return datetime.datetime.fromtimestamp(
            predicted_timestamp / (1000 ** 3) + life_expectancy_day).strftime('%Y-%m-%d')

    @staticmethod
    def _raw_smart_value(raw: Dict[str, Any]) -> Any:
        """
        Return the raw value of one SMART attribute.

        Uses ``raw['string']`` when it is all digits, else its first
        space-separated token when that is all digits, else ``raw['value']``
        (0 when missing).
        """
        raw_string = raw.get('string')
        text = str(raw_string)
        if text.isdigit():
            return int(raw_string)
        leading = text.split(' ')[0]
        if leading.isdigit():
            return int(leading)
        return raw.get('value', 0)

    def _extract_dev_smart(self, s_val: Dict[str, Any]) -> Dict[str, Any]:
        """
        Flatten one health sample into the feature dict given to the predictor.

        :param s_val: one entry of the ``show_device_metrics`` output
        :return: dict of ``smart_<id>_raw`` / ``smart_<id>_normalized`` values
            plus ``user_capacity``, ``model_name`` and ``vendor`` when present;
            empty when the sample holds none of them
        """
        dev_smart: Dict[str, Any] = {}

        # add all smart attributes
        ata_smart = s_val.get('ata_smart_attributes', {})
        for attr in ata_smart.get('table', []):
            attr_id = attr.get('id')
            # get raw smart values
            raw = attr.get('raw', {})
            if raw.get('string') is not None:
                dev_smart['smart_%s_raw' % attr_id] = self._raw_smart_value(raw)
            # get normalized smart values
            normalized = attr.get('value')
            if normalized is not None:
                dev_smart['smart_%s_normalized' % attr_id] = normalized

        # add power on hours manually if not available in smart attributes
        power_on_time = s_val.get('power_on_time', {}).get('hours')
        if power_on_time is not None:
            dev_smart['smart_9_raw'] = int(power_on_time)

        # add device capacity
        user_capacity = s_val.get('user_capacity', {}).get('bytes')
        if user_capacity is not None:
            dev_smart['user_capacity'] = user_capacity
        else:
            self.log.debug('user_capacity not found in smart attributes list')

        # add device model
        model_name = s_val.get('model_name')
        if model_name is not None:
            dev_smart['model_name'] = model_name

        # add vendor
        vendor = s_val.get('vendor')
        if vendor is not None:
            dev_smart['vendor'] = vendor

        return dev_smart

    def _predict_life_expectancy(self, devid: str) -> str:
        """
        Predict the health of one device from its stored SMART samples.

        :param devid: device id passed to ``devicehealth show_device_metrics``
        :return: the predictor's result string, or '' when the metrics could
            not be fetched, the predictor could not be created or
            initialized, or fewer than 6 usable samples are available
        """
        predicted_result = ''
        health_data: Dict[str, Dict[str, Any]] = {}
        try:
            r, outb, outs = self.remote(
                'devicehealth', 'show_device_metrics', devid=devid, sample='')
            if r != 0:
                self.log.error('failed to get device %s health', devid)
            else:
                health_data = json.loads(outb)
        except Exception as e:
            self.log.error('failed to get device %s health data due to %s', devid, str(e))

        # initialize appropriate disk failure predictor model
        obj_predictor = Predictor.create(self.predictor_model)
        if obj_predictor is None:
            self.log.error('invalid value received for MODULE_OPTIONS.predictor_model')
            return predicted_result
        try:
            obj_predictor.initialize(
                "{}/models/{}".format(get_diskfailurepredictor_path(), self.predictor_model))
        except Exception as e:
            self.log.error('Error initializing predictor: %s', e)
            return predicted_result

        if len(health_data) < 6:
            self.log.error('unable to predict device due to health data records less than 6 days')
            return predicted_result

        # Walk the samples in descending key order and keep at most 12
        # non-empty ones.
        predict_datas: List[DevSmartT] = []
        for o_key in sorted(health_data, reverse=True):
            dev_smart = self._extract_dev_smart(health_data[o_key])
            # if smart data was found, then add that to list
            if dev_smart:
                predict_datas.append(dev_smart)
            if len(predict_datas) >= 12:
                break

        if len(predict_datas) >= 6:
            predicted_result = obj_predictor.predict(predict_datas)
        return predicted_result

    def predict_life_expectancy(self, devid: str) -> Tuple[int, str, str]:
        """
        Predict one device and describe its life expectancy.

        :return: ``(0, label, '')`` where label is '>6w', '>=2w and <=6w',
            '<2w' or 'unknown'
        """
        result = self._predict_life_expectancy(devid)
        label = self._LIFE_EXPECTANCY_LABELS.get(result.lower(), 'unknown')
        return 0, label, ''

    def _send_mon_command(self, command: Dict[str, Any], failure_msg: str) -> int:
        """
        Send ``command`` to the mon, wait for it, and log ``failure_msg``
        with the command's status output when it returns non-zero.

        :return: the command's return code
        """
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps(command), '')
        ret, _, outs = result.wait()
        if ret != 0:
            self.log.error('%s, %s', failure_msg, outs)
        return ret

    def _reset_device_life_expectancy(self, device_id: str) -> int:
        """Remove the stored life expectancy of ``device_id``; return the mon return code."""
        return self._send_mon_command(
            {
                'prefix': 'device rm-life-expectancy',
                'devid': device_id
            },
            'failed to reset device life expectancy')

    def _set_device_life_expectancy(self,
                                    device_id: str,
                                    from_date: str,
                                    to_date: Optional[str] = None) -> int:
        """
        Store the life expectancy window of ``device_id``.

        :param from_date: start of the window
        :param to_date: end of the window; omitted from the command when None
        :return: the mon command's return code
        """
        command = {
            'prefix': 'device set-life-expectancy',
            'devid': device_id,
            'from': from_date
        }
        if to_date is not None:
            command['to'] = to_date
        return self._send_mon_command(
            command, 'failed to set device life expectancy')

    def predict_all_devices(self) -> Tuple[int, str, str]:
        """
        Predict every device that has daemons and a device id, and store
        or clear its life expectancy accordingly.

        :return: always ``(0, 'succeed to predicted all devices', '')``
        """
        self.log.debug('predict_all_devices')
        devices = self.get('devices').get('devices', [])
        for dev_info in devices:
            if not dev_info.get('daemons'):
                continue
            if not dev_info.get('devid'):
                continue
            devid = dev_info['devid']
            self.log.debug('%s', dev_info)
            result = self._predict_life_expectancy(devid)

            window = self._LIFE_EXPECTANCY_WINDOWS.get(result.lower())
            if window is None:
                # No usable prediction (empty or unrecognized label):
                # drop whatever life expectancy was stored before.
                self._reset_device_life_expectancy(devid)
                continue

            # A window start of 0 ('bad') is a valid offset meaning "from
            # now"; it must not be mistaken for "no prediction", which
            # would clear the life expectancy of the worst devices.
            offset_min, offset_max = window
            predicted = int(time.time() * (1000 ** 3))
            from_date: Optional[str] = None
            to_date: Optional[str] = None
            try:
                from_date = self._convert_timestamp(predicted, offset_min)
                if offset_max is not None:
                    to_date = self._convert_timestamp(predicted, offset_max)
                ret = self._set_device_life_expectancy(devid, from_date, to_date)
            except Exception as e:
                self.log.error(
                    'failed to set device {} life expectancy from: {}, to: {}, {}'.format(
                        devid, from_date, to_date, str(e)))
            else:
                # A non-zero return has already been logged by
                # _set_device_life_expectancy; only report real successes.
                if ret == 0:
                    self.log.info(
                        'succeed to set device {} life expectancy from: {}, to: {}'.format(
                            devid, from_date, to_date))
        return 0, 'succeed to predicted all devices', ''