2 diskprediction with local predictor
7 from threading
import Event
10 from mgr_module
import MgrModule
, CommandResult
# Importing scipy early appears to avoid a future deadlock when
#   from .predictor import get_diskfailurepredictor_path
# is later executed in a command thread. See https://tracker.ceph.com/issues/42764
# strftime()/strptime() pattern for the 'last_predicted' value kept in the
# module store (year, month, day, dash, hour, minute, second).
TIME_FORMAT = '%Y%m%d-%H%M%S'
# Length of one week, expressed as seven TIME_DAYS units.
TIME_WEEK = TIME_DAYS * 7
26 class Module(MgrModule
):
29 'name': 'sleep_interval',
33 'name': 'predict_interval',
34 'default': str(86400),
37 'name': 'predictor_model',
38 'default': 'prophetstor',
def __init__(self, *args, **kwargs):
    """Run the base-class initialiser, then preset every module option.

    Each entry of ``MODULE_OPTIONS`` becomes an instance attribute named
    after the entry's 'name', holding the entry's 'default'.

    :param args: positional arguments forwarded unchanged to the base class
    :param kwargs: keyword arguments forwarded unchanged to the base class
    """
    super(Module, self).__init__(*args, **kwargs)

    # Seed the attributes with the declared defaults so they exist before
    # any configuration value has been read.
    for option in self.MODULE_OPTIONS:
        attr_name = option['name']
        setattr(self, attr_name, option['default'])
53 def config_notify(self
):
54 for opt
in self
.MODULE_OPTIONS
:
57 self
.get_module_option(opt
['name']))
58 self
.log
.debug(' %s = %s', opt
['name'], getattr(self
, opt
['name']))
59 if self
.get_ceph_option('device_failure_prediction_mode') == 'local':
62 def refresh_config(self
):
63 for opt
in self
.MODULE_OPTIONS
:
66 self
.get_module_option(opt
['name']))
67 self
.log
.debug(' %s = %s', opt
['name'], getattr(self
, opt
['name']))
def handle_command(self, _, cmd):
    """Log the received command, then reject it as not implemented.

    :param _: ignored
    :param cmd: command mapping; its 'prefix' entry becomes the argument
        of the raised exception
    :raises NotImplementedError: always, carrying ``cmd['prefix']``
    """
    logger = self.log
    logger.debug('handle_command cmd: %s', cmd)
    # Looked up only after logging, so a missing 'prefix' still leaves a
    # debug record of the offending command.
    prefix = cmd['prefix']
    raise NotImplementedError(prefix)
74 self
.log
.debug('self_test enter')
75 ret
, out
, err
= self
.predict_all_devices()
77 return 0, 'self test succeed', ''
80 self
.log
.info('Starting diskprediction local module')
83 ls
= self
.get_store('last_predicted')
86 last_predicted
= datetime
.datetime
.strptime(ls
, TIME_FORMAT
)
89 self
.log
.debug('Last predicted %s', last_predicted
)
93 mode
= self
.get_ceph_option('device_failure_prediction_mode')
95 now
= datetime
.datetime
.utcnow()
96 if not last_predicted
:
99 predicted_frequency
= int(self
.predict_interval
) or 86400
100 seconds
= (last_predicted
- datetime
.datetime
.utcfromtimestamp(0)).total_seconds()
101 seconds
-= seconds
% predicted_frequency
102 seconds
+= predicted_frequency
103 next_predicted
= datetime
.datetime
.utcfromtimestamp(seconds
)
105 self
.log
.debug('Last scrape %s, next scrape due %s',
106 last_predicted
.strftime(TIME_FORMAT
),
107 next_predicted
.strftime(TIME_FORMAT
))
109 self
.log
.debug('Last scrape never, next scrape due %s',
110 next_predicted
.strftime(TIME_FORMAT
))
111 if now
>= next_predicted
:
112 self
.predict_all_devices()
114 self
.set_store('last_predicted', last_predicted
.strftime(TIME_FORMAT
))
116 sleep_interval
= int(self
.sleep_interval
) or 60
117 self
.log
.debug('Sleeping for %d seconds', sleep_interval
)
118 self
._event
.wait(sleep_interval
)
122 self
.log
.info('Stopping')
127 def _convert_timestamp(predicted_timestamp
, life_expectancy_day
):
129 :param predicted_timestamp: unit is nanoseconds
130 :param life_expectancy_day: unit is seconds
132 date format '%Y-%m-%d' ex. 2018-01-01
134 return datetime
.datetime
.fromtimestamp(
135 predicted_timestamp
/ (1000 ** 3) + life_expectancy_day
).strftime('%Y-%m-%d')
137 def _predict_life_expentancy(self
, devid
):
138 predicted_result
= ''
142 r
, outb
, outs
= self
.remote('devicehealth', 'show_device_metrics', devid
=devid
, sample
='')
144 self
.log
.error('failed to get device %s health', devid
)
147 health_data
= json
.loads(outb
)
148 except Exception as e
:
149 self
.log
.error('failed to get device %s health data due to %s', devid
, str(e
))
151 # initialize appropriate disk failure predictor model
152 from .predictor
import get_diskfailurepredictor_path
153 if self
.predictor_model
== 'prophetstor':
154 from .predictor
import PSDiskFailurePredictor
155 obj_predictor
= PSDiskFailurePredictor()
156 ret
= obj_predictor
.initialize("{}/models/{}".format(get_diskfailurepredictor_path(), self
.predictor_model
))
158 self
.log
.error('Error initializing predictor')
159 return predicted_result
160 elif self
.predictor_model
== 'redhat':
161 from .predictor
import RHDiskFailurePredictor
162 obj_predictor
= RHDiskFailurePredictor()
163 ret
= obj_predictor
.initialize("{}/models/{}".format(get_diskfailurepredictor_path(), self
.predictor_model
))
165 self
.log
.error('Error initializing predictor')
166 return predicted_result
168 self
.log
.error('invalid value received for MODULE_OPTIONS.predictor_model')
169 return predicted_result
171 if len(health_data
) >= 6:
172 o_keys
= sorted(health_data
.keys(), reverse
=True)
174 # get values for current day (?)
176 s_val
= health_data
[o_key
]
178 # add all smart attributes
179 ata_smart
= s_val
.get('ata_smart_attributes', {})
180 for attr
in ata_smart
.get('table', []):
181 # get raw smart values
182 if attr
.get('raw', {}).get('string') is not None:
183 if str(attr
.get('raw', {}).get('string', '0')).isdigit():
184 dev_smart
['smart_%s_raw' % attr
.get('id')] = \
185 int(attr
.get('raw', {}).get('string', '0'))
187 if str(attr
.get('raw', {}).get('string', '0')).split(' ')[0].isdigit():
188 dev_smart
['smart_%s_raw' % attr
.get('id')] = \
189 int(attr
.get('raw', {}).get('string',
192 dev_smart
['smart_%s_raw' % attr
.get('id')] = \
193 attr
.get('raw', {}).get('value', 0)
194 # get normalized smart values
195 if attr
.get('value') is not None:
196 dev_smart
['smart_%s_normalized' % attr
.get('id')] = \
198 # add power on hours manually if not available in smart attributes
199 if s_val
.get('power_on_time', {}).get('hours') is not None:
200 dev_smart
['smart_9_raw'] = int(s_val
['power_on_time']['hours'])
201 # add device capacity
202 if s_val
.get('user_capacity') is not None:
203 if s_val
.get('user_capacity').get('bytes') is not None:
204 dev_smart
['user_capacity'] = s_val
.get('user_capacity').get('bytes')
206 self
.log
.debug('user_capacity not found in smart attributes list')
208 if s_val
.get('model_name') is not None:
209 dev_smart
['model_name'] = s_val
.get('model_name')
211 if s_val
.get('vendor') is not None:
212 dev_smart
['vendor'] = s_val
.get('vendor')
213 # if smart data was found, then add that to list
215 predict_datas
.append(dev_smart
)
216 if len(predict_datas
) >= 12:
219 self
.log
.error('unable to predict device due to health data records less than 6 days')
221 if len(predict_datas
) >= 6:
222 predicted_result
= obj_predictor
.predict(predict_datas
)
223 return predicted_result
225 def predict_life_expectancy(self
, devid
):
226 result
= self
._predict
_life
_expentancy
(devid
)
227 if result
.lower() == 'good':
229 elif result
.lower() == 'warning':
230 return 0, '>=2w and <=6w', ''
231 elif result
.lower() == 'bad':
234 return 0, 'unknown', ''
236 def _reset_device_life_expectancy(self
, device_id
):
237 result
= CommandResult('')
238 self
.send_command(result
, 'mon', '', json
.dumps({
239 'prefix': 'device rm-life-expectancy',
242 ret
, _
, outs
= result
.wait()
245 'failed to reset device life expectancy, %s' % outs
)
248 def _set_device_life_expectancy(self
, device_id
, from_date
, to_date
=None):
249 result
= CommandResult('')
252 self
.send_command(result
, 'mon', '', json
.dumps({
253 'prefix': 'device set-life-expectancy',
258 self
.send_command(result
, 'mon', '', json
.dumps({
259 'prefix': 'device set-life-expectancy',
264 ret
, _
, outs
= result
.wait()
267 'failed to set device life expectancy, %s' % outs
)
270 def predict_all_devices(self
):
271 self
.log
.debug('predict_all_devices')
272 devices
= self
.get('devices').get('devices', [])
273 for devInfo
in devices
:
274 if not devInfo
.get('daemons'):
276 if not devInfo
.get('devid'):
278 self
.log
.debug('%s' % devInfo
)
279 result
= self
._predict
_life
_expentancy
(devInfo
['devid'])
280 if result
== 'unknown':
281 self
._reset
_device
_life
_expectancy
(devInfo
['devid'])
283 predicted
= int(time
.time() * (1000 ** 3))
285 if result
.lower() == 'good':
286 life_expectancy_day_min
= (TIME_WEEK
* 6) + TIME_DAYS
287 life_expectancy_day_max
= None
288 elif result
.lower() == 'warning':
289 life_expectancy_day_min
= (TIME_WEEK
* 2)
290 life_expectancy_day_max
= (TIME_WEEK
* 6)
291 elif result
.lower() == 'bad':
292 life_expectancy_day_min
= 0
293 life_expectancy_day_max
= (TIME_WEEK
* 2) - TIME_DAYS
296 life_expectancy_day_min
= None
297 life_expectancy_day_max
= None
299 if predicted
and devInfo
['devid'] and life_expectancy_day_min
:
303 if life_expectancy_day_min
:
304 from_date
= self
._convert
_timestamp
(predicted
, life_expectancy_day_min
)
306 if life_expectancy_day_max
:
307 to_date
= self
._convert
_timestamp
(predicted
, life_expectancy_day_max
)
309 self
._set
_device
_life
_expectancy
(devInfo
['devid'], from_date
, to_date
)
311 'succeed to set device {} life expectancy from: {}, to: {}'.format(
312 devInfo
['devid'], from_date
, to_date
))
313 except Exception as e
:
315 'failed to set device {} life expectancy from: {}, to: {}, {}'.format(
316 devInfo
['devid'], from_date
, to_date
, str(e
)))
318 self
._reset
_device
_life
_expectancy
(devInfo
['devid'])
319 return 0, 'succeed to predicted all devices', ''