]> git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/diskprediction_local/module.py
import 15.2.5
[ceph.git] / ceph / src / pybind / mgr / diskprediction_local / module.py
1 """
2 diskprediction with local predictor
3 """
4 import json
5 import datetime
6 import _strptime
7 from threading import Event
8 import time
9
10 from mgr_module import MgrModule, CommandResult
11
12 # Importing scipy early appears to avoid a future deadlock when
13 # we try to do
14 #
15 # from .predictor import get_diskfailurepredictor_path
16 #
17 # in a command thread. See https://tracker.ceph.com/issues/42764
18 import scipy
19
20
# strftime/strptime pattern used when persisting the last prediction time
TIME_FORMAT = '%Y%m%d-%H%M%S'
# number of seconds in one day (the value is seconds, despite the name)
TIME_DAYS = 60 * 60 * 24
# number of seconds in one week
TIME_WEEK = 7 * TIME_DAYS
24
25
class Module(MgrModule):
    """
    Manager module that predicts device life expectancy with a local model.

    The ``serve`` loop periodically feeds the health metrics obtained from
    the ``devicehealth`` module into a disk failure predictor and sends the
    resulting life expectancy to the monitors via the
    ``device set-life-expectancy`` / ``device rm-life-expectancy`` commands.
    """

    MODULE_OPTIONS = [
        {
            # seconds the serve loop waits between iterations
            'name': 'sleep_interval',
            'default': str(600),
        },
        {
            # seconds between two prediction runs
            'name': 'predict_interval',
            'default': str(86400),
        },
        {
            # predictor to use: 'prophetstor' or 'redhat'
            'name': 'predictor_model',
            'default': 'prophetstor',
        },
    ]

    COMMANDS = []

    # Predictor verdict -> (human readable label,
    #                       earliest failure in seconds from now,
    #                       latest failure in seconds from now or None)
    _LIFE_EXPECTANCY = {
        'good': ('>6w', (TIME_WEEK * 6) + TIME_DAYS, None),
        'warning': ('>=2w and <=6w', TIME_WEEK * 2, TIME_WEEK * 6),
        'bad': ('<2w', 0, (TIME_WEEK * 2) - TIME_DAYS),
    }

    def __init__(self, *args, **kwargs):
        """Initialise option attributes with their defaults and loop state."""
        super(Module, self).__init__(*args, **kwargs)
        # options
        for opt in self.MODULE_OPTIONS:
            setattr(self, opt['name'], opt['default'])
        # other
        self._run = True
        self._event = Event()

    def config_notify(self):
        """
        Reload the module options and wake up the serve loop when the
        device failure prediction mode is 'local'.
        """
        self.refresh_config()
        if self.get_ceph_option('device_failure_prediction_mode') == 'local':
            self._event.set()

    def refresh_config(self):
        """Copy the current value of every module option onto ``self``."""
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))

    def handle_command(self, _, cmd):
        """
        This module registers no commands (``COMMANDS`` is empty), so any
        command reaching it is rejected.

        :raises NotImplementedError: always, carrying the command prefix
        """
        self.log.debug('handle_command cmd: %s', cmd)
        raise NotImplementedError(cmd['prefix'])

    def self_test(self):
        """
        Run one prediction pass over all devices.

        :return: tuple (0, message, '') on success
        :raises AssertionError: if the prediction pass reports a failure
        """
        self.log.debug('self_test enter')
        ret, _, err = self.predict_all_devices()
        assert ret == 0, err
        return 0, 'self test succeed', ''

    def _next_predicted(self, last_predicted, now):
        """
        Compute when the next prediction run is due.

        :param last_predicted: datetime of the previous run, or None
        :param now: current datetime
        :return: ``now`` if no run happened yet, otherwise the end of the
                 ``predict_interval`` sized slot containing ``last_predicted``
        """
        if not last_predicted:
            return now
        predicted_frequency = int(self.predict_interval) or 86400
        epoch = datetime.datetime.utcfromtimestamp(0)
        seconds = (last_predicted - epoch).total_seconds()
        # round down to the start of the slot, then move to the next slot
        seconds -= seconds % predicted_frequency
        seconds += predicted_frequency
        return datetime.datetime.utcfromtimestamp(seconds)

    def serve(self):
        """
        Main loop: while the module is running and the prediction mode is
        'local', predict all devices whenever a run is due and remember the
        time of the last run in the module store.
        """
        self.log.info('Starting diskprediction local module')
        self.config_notify()
        last_predicted = None
        ls = self.get_store('last_predicted')
        if ls:
            try:
                last_predicted = datetime.datetime.strptime(ls, TIME_FORMAT)
            except ValueError:
                # treat an unparsable value like "never predicted"
                self.log.warning(
                    'ignoring invalid stored last_predicted value %s', ls)
        self.log.debug('Last predicted %s', last_predicted)

        while self._run:
            self.refresh_config()
            mode = self.get_ceph_option('device_failure_prediction_mode')
            if mode == 'local':
                now = datetime.datetime.utcnow()
                next_predicted = self._next_predicted(last_predicted, now)
                if last_predicted:
                    self.log.debug('Last prediction %s, next prediction due %s',
                                   last_predicted.strftime(TIME_FORMAT),
                                   next_predicted.strftime(TIME_FORMAT))
                else:
                    self.log.debug('Last prediction never, next prediction due %s',
                                   next_predicted.strftime(TIME_FORMAT))
                if now >= next_predicted:
                    self.predict_all_devices()
                    last_predicted = now
                    self.set_store('last_predicted',
                                   last_predicted.strftime(TIME_FORMAT))

            sleep_interval = int(self.sleep_interval) or 60
            self.log.debug('Sleeping for %d seconds', sleep_interval)
            # config_notify() and shutdown() set the event to end the wait early
            self._event.wait(sleep_interval)
            self._event.clear()

    def shutdown(self):
        """Ask the serve loop to stop and wake it up."""
        self.log.info('Stopping')
        self._run = False
        self._event.set()

    @staticmethod
    def _convert_timestamp(predicted_timestamp, life_expectancy_day):
        """
        :param predicted_timestamp: unit is nanoseconds
        :param life_expectancy_day: unit is seconds
        :return:
            date format '%Y-%m-%d' ex. 2018-01-01
            (computed with datetime.fromtimestamp, i.e. in local time)
        """
        return datetime.datetime.fromtimestamp(
            predicted_timestamp / (1000 ** 3) + life_expectancy_day).strftime('%Y-%m-%d')

    def _create_predictor(self):
        """
        Instantiate and initialise the predictor selected by the
        ``predictor_model`` option.

        :return: the initialised predictor, or None if the option value is
                 invalid or the initialisation failed (an error is logged)
        """
        # imported lazily, as in the original code
        from .predictor import get_diskfailurepredictor_path
        if self.predictor_model == 'prophetstor':
            from .predictor import PSDiskFailurePredictor
            obj_predictor = PSDiskFailurePredictor()
        elif self.predictor_model == 'redhat':
            from .predictor import RHDiskFailurePredictor
            obj_predictor = RHDiskFailurePredictor()
        else:
            self.log.error('invalid value received for MODULE_OPTIONS.predictor_model')
            return None

        ret = obj_predictor.initialize(
            "{}/models/{}".format(get_diskfailurepredictor_path(), self.predictor_model))
        # initialize() signals success by returning None
        if ret is not None:
            self.log.error('Error initializing predictor')
            return None
        return obj_predictor

    def _extract_smart_values(self, s_val):
        """
        Build the predictor input for one health record.

        :param s_val: one health record (dict) of a device
        :return: dict with 'smart_<id>_raw' / 'smart_<id>_normalized'
                 entries plus 'user_capacity', 'model_name' and 'vendor'
                 when present; empty if the record has none of them
        """
        dev_smart = {}

        # add all smart attributes
        ata_smart = s_val.get('ata_smart_attributes', {})
        for attr in ata_smart.get('table', []):
            attr_id = attr.get('id')
            raw = attr.get('raw', {})
            raw_string = raw.get('string')
            # get raw smart values
            if raw_string is not None:
                raw_text = str(raw_string)
                first_token = raw_text.split(' ')[0]
                if raw_text.isdigit():
                    dev_smart['smart_%s_raw' % attr_id] = int(raw_text)
                elif first_token.isdigit():
                    # only the leading number of the raw string is used
                    dev_smart['smart_%s_raw' % attr_id] = int(first_token)
                else:
                    dev_smart['smart_%s_raw' % attr_id] = raw.get('value', 0)
            # get normalized smart values
            normalized = attr.get('value')
            if normalized is not None:
                dev_smart['smart_%s_normalized' % attr_id] = normalized

        # power on hours reported outside the attribute table take
        # precedence over a smart_9_raw value collected above
        hours = s_val.get('power_on_time', {}).get('hours')
        if hours is not None:
            dev_smart['smart_9_raw'] = int(hours)

        # add device capacity
        user_capacity = s_val.get('user_capacity')
        if user_capacity is not None:
            capacity_bytes = user_capacity.get('bytes')
            if capacity_bytes is not None:
                dev_smart['user_capacity'] = capacity_bytes
            else:
                self.log.debug('user_capacity not found in smart attributes list')

        # add device model
        model_name = s_val.get('model_name')
        if model_name is not None:
            dev_smart['model_name'] = model_name

        # add vendor
        vendor = s_val.get('vendor')
        if vendor is not None:
            dev_smart['vendor'] = vendor

        return dev_smart

    def _predict_life_expentancy(self, devid):
        """
        Predict the condition of one device from its health metrics.

        :param devid: device id
        :return: the value returned by the predictor, or '' when the health
                 data could not be fetched, fewer than 6 usable records
                 exist, or the predictor could not be initialised
        """
        predicted_result = ''
        health_data = {}
        try:
            r, outb, outs = self.remote('devicehealth', 'show_device_metrics',
                                        devid=devid, sample='')
            if r != 0:
                self.log.error('failed to get device %s health', devid)
            else:
                health_data = json.loads(outb)
        except Exception as e:
            self.log.error('failed to get device %s health data due to %s', devid, str(e))

        if not isinstance(health_data, dict):
            self.log.error('unexpected health data format for device %s', devid)
            return predicted_result

        if len(health_data) < 6:
            self.log.error('unable to predict device due to health data records less than 6 days')
            return predicted_result

        predict_datas = []
        # keys are visited in reverse sorted order; presumably they are
        # timestamps so the most recent records come first -- TODO confirm
        for o_key in sorted(health_data.keys(), reverse=True):
            dev_smart = self._extract_smart_values(health_data[o_key])
            # if smart data was found, then add that to list
            if dev_smart:
                predict_datas.append(dev_smart)
            if len(predict_datas) >= 12:
                break

        if len(predict_datas) < 6:
            return predicted_result

        # only load the model once we know there is enough data to predict
        obj_predictor = self._create_predictor()
        if obj_predictor is None:
            return predicted_result
        return obj_predictor.predict(predict_datas)

    def predict_life_expectancy(self, devid):
        """
        Predict the life expectancy of one device.

        :param devid: device id
        :return: tuple (0, label, '') where label is '>6w', '>=2w and <=6w',
                 '<2w' or 'unknown'
        """
        result = self._predict_life_expentancy(devid)
        verdict = self._LIFE_EXPECTANCY.get(result.lower())
        label = verdict[0] if verdict is not None else 'unknown'
        return 0, label, ''

    def _reset_device_life_expectancy(self, device_id):
        """
        Remove the stored life expectancy of a device.

        :param device_id: device id
        :return: return code of the 'device rm-life-expectancy' mon command
        """
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps({
            'prefix': 'device rm-life-expectancy',
            'devid': device_id
        }), '')
        ret, _, outs = result.wait()
        if ret != 0:
            self.log.error(
                'failed to reset device life expectancy, %s', outs)
        return ret

    def _set_device_life_expectancy(self, device_id, from_date, to_date=None):
        """
        Store the life expectancy of a device.

        :param device_id: device id
        :param from_date: start of the expected failure window
        :param to_date: end of the expected failure window; omitted from
                        the command when None
        :return: return code of the 'device set-life-expectancy' mon command
        """
        command = {
            'prefix': 'device set-life-expectancy',
            'devid': device_id,
            'from': from_date,
        }
        if to_date is not None:
            command['to'] = to_date

        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps(command), '')
        ret, _, outs = result.wait()
        if ret != 0:
            self.log.error(
                'failed to set device life expectancy, %s', outs)
        return ret

    def predict_all_devices(self):
        """
        Predict every device that has a device id and at least one daemon,
        and set its life expectancy; devices without a usable prediction
        get their life expectancy reset.

        :return: tuple (0, message, '')
        """
        self.log.debug('predict_all_devices')
        devices = self.get('devices').get('devices', [])
        for devInfo in devices:
            if not devInfo.get('daemons'):
                continue
            devid = devInfo.get('devid')
            if not devid:
                continue
            self.log.debug('%s', devInfo)

            result = self._predict_life_expentancy(devid)
            verdict = self._LIFE_EXPECTANCY.get(result.lower())
            if verdict is None:
                # '', 'unknown' or any other unrecognised predictor answer
                self._reset_device_life_expectancy(devid)
                continue

            _, life_expectancy_day_min, life_expectancy_day_max = verdict
            predicted = int(time.time() * (1000 ** 3))
            from_date = None
            to_date = None
            try:
                # compare against None: the 'bad' window starts at 0 seconds,
                # which must not be mistaken for "no prediction"
                from_date = self._convert_timestamp(predicted, life_expectancy_day_min)
                if life_expectancy_day_max is not None:
                    to_date = self._convert_timestamp(predicted, life_expectancy_day_max)
                ret = self._set_device_life_expectancy(devid, from_date, to_date)
            except Exception as e:
                # best effort: one failing device must not stop the others
                self.log.error(
                    'failed to set device %s life expectancy from: %s, to: %s, %s',
                    devid, from_date, to_date, str(e))
                continue

            # a non-zero return code was already logged by the setter
            if ret == 0:
                self.log.info(
                    'succeed to set device %s life expectancy from: %s, to: %s',
                    devid, from_date, to_date)
        return 0, 'succeed to predicted all devices', ''