]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/diskprediction_local/module.py
update download target update for octopus release
[ceph.git] / ceph / src / pybind / mgr / diskprediction_local / module.py
CommitLineData
11fdf7f2
TL
"""
Disk failure prediction using the local (in-cluster) predictor.
"""
4import json
5import datetime
6import _strptime
7from threading import Event
8import time
9
10from mgr_module import MgrModule, CommandResult
11
12
# strftime/strptime pattern used to persist the 'last_predicted' timestamp.
TIME_FORMAT = '%Y%m%d-%H%M%S'
# Number of seconds in one day.
TIME_DAYS = 24 * 60 * 60
# Number of seconds in one week.
TIME_WEEK = TIME_DAYS * 7
16
17
class Module(MgrModule):
    """Manager module that predicts device life expectancy locally.

    While the ceph option ``device_failure_prediction_mode`` equals
    ``'local'``, :meth:`serve` periodically runs the bundled predictor over
    the health metrics of every device that has daemons attached and stores
    the outcome with the ``device set-life-expectancy`` /
    ``device rm-life-expectancy`` mon commands.
    """

    MODULE_OPTIONS = [
        {
            'name': 'sleep_interval',
            'default': str(600),
        },
        {
            'name': 'predict_interval',
            'default': str(86400),
        },
    ]

    COMMANDS = []

    # Predictor verdict -> (min, max) life expectancy, in seconds from the
    # prediction time.  A max of None means "no upper bound".
    _LIFE_EXPECTANCY_RANGES = {
        'good': ((TIME_WEEK * 6) + TIME_DAYS, None),
        'warning': (TIME_WEEK * 2, TIME_WEEK * 6),
        'bad': (0, (TIME_WEEK * 2) - TIME_DAYS),
    }

    # Predictor verdict -> human readable life expectancy.
    _LIFE_EXPECTANCY_LABELS = {
        'good': '>6w',
        'warning': '>=2w and <=6w',
        'bad': '<2w',
    }

    def __init__(self, *args, **kwargs):
        """Seed every module option with its default and set up run state."""
        super(Module, self).__init__(*args, **kwargs)
        # options
        for opt in self.MODULE_OPTIONS:
            setattr(self, opt['name'], opt['default'])
        # other
        self._run = True
        self._event = Event()

    def config_notify(self):
        """Reload the options and wake :meth:`serve` if local mode is on."""
        self.refresh_config()
        if self.get_ceph_option('device_failure_prediction_mode') == 'local':
            self._event.set()

    def refresh_config(self):
        """Copy each module option (or its default) onto ``self``."""
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']) or opt['default'])
            self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))

    def handle_command(self, _, cmd):
        """No commands are registered (``COMMANDS`` is empty); always raises.

        :raises NotImplementedError: with the command prefix as argument.
        """
        self.log.debug('handle_command cmd: %s', cmd)
        raise NotImplementedError(cmd['prefix'])

    def self_test(self):
        """Run one prediction pass over all devices and expect success."""
        ret, _, _ = self.predict_all_devices()
        assert ret == 0
        return 0, 'self test succeed', ''

    def _int_option(self, name, fallback):
        """Return option ``name`` as a positive int, else ``fallback``.

        A malformed or non-positive value must not take down the serve loop
        (``int()`` raising) or turn it into a busy loop (a negative wait
        timeout returns immediately), so such values fall back.
        """
        raw = getattr(self, name, None)
        try:
            value = int(raw)
        except (TypeError, ValueError):
            self.log.warning('invalid value %r for option %s, using %d',
                             raw, name, fallback)
            return fallback
        return value if value > 0 else fallback

    def serve(self):
        """Main loop: predict all devices once per ``predict_interval``.

        The time of the last run is persisted under the ``last_predicted``
        store key (formatted with ``TIME_FORMAT``) and read back on start.
        The loop wakes every ``sleep_interval`` seconds, or earlier when
        the internal event is set by :meth:`config_notify` or
        :meth:`shutdown`.
        """
        self.log.info('Starting diskprediction local module')
        self.config_notify()
        last_predicted = None
        ls = self.get_store('last_predicted')
        if ls:
            try:
                last_predicted = datetime.datetime.strptime(ls, TIME_FORMAT)
            except ValueError:
                # An unparsable stored value is treated as "never predicted".
                pass
        self.log.debug('Last predicted %s', last_predicted)

        epoch = datetime.datetime.utcfromtimestamp(0)
        while self._run:
            self.refresh_config()
            mode = self.get_ceph_option('device_failure_prediction_mode')
            if mode == 'local':
                now = datetime.datetime.utcnow()
                if not last_predicted:
                    next_predicted = now
                    self.log.debug('Last scrape never, next scrape due %s',
                                   next_predicted.strftime(TIME_FORMAT))
                else:
                    predicted_frequency = self._int_option('predict_interval', 86400)
                    # Round the last run down to a multiple of the interval
                    # and schedule the next run one interval after that.
                    seconds = (last_predicted - epoch).total_seconds()
                    seconds -= seconds % predicted_frequency
                    seconds += predicted_frequency
                    next_predicted = datetime.datetime.utcfromtimestamp(seconds)
                    self.log.debug('Last scrape %s, next scrape due %s',
                                   last_predicted.strftime(TIME_FORMAT),
                                   next_predicted.strftime(TIME_FORMAT))
                if now >= next_predicted:
                    self.predict_all_devices()
                    last_predicted = now
                    self.set_store('last_predicted', last_predicted.strftime(TIME_FORMAT))

            sleep_interval = self._int_option('sleep_interval', 60)
            self.log.debug('Sleeping for %d seconds', sleep_interval)
            self._event.wait(sleep_interval)
            self._event.clear()

    def shutdown(self):
        """Ask :meth:`serve` to stop and wake it up so it exits promptly."""
        self.log.info('Stopping')
        self._run = False
        self._event.set()

    @staticmethod
    def _convert_timestamp(predicted_timestamp, life_expectancy_day):
        """Return the date ``life_expectancy_day`` seconds after a timestamp.

        :param predicted_timestamp: unit is nanoseconds
        :param life_expectancy_day: unit is seconds (despite the name)
        :return:
            date format '%Y-%m-%d' ex. 2018-01-01
        """
        # NOTE(review): fromtimestamp() yields local time while serve()
        # schedules in UTC -- confirm which one the mon expects.
        return datetime.datetime.fromtimestamp(
            predicted_timestamp / (1000 ** 3) + life_expectancy_day).strftime('%Y-%m-%d')

    @staticmethod
    def _extract_smart_attrs(sample):
        """Build the predictor input dict from one health sample.

        :param sample: one entry of the device metrics mapping
        :return: dict mapping ``'smart_<id>_raw'`` to a value; empty when
            the sample carries no usable attribute
        """
        dev_smart = {}
        ata_smart = sample.get('ata_smart_attributes', {})
        for attr in ata_smart.get('table', []):
            raw = attr.get('raw', {})
            raw_string = raw.get('string')
            if not raw_string:
                continue
            # Use the leading token of the raw string when it is numeric
            # (this also covers a string that is all digits, which has no
            # space to split on); otherwise fall back to raw['value'].
            leading = str(raw_string).split(' ')[0]
            if leading.isdigit():
                value = int(leading)
            else:
                value = raw.get('value', 0)
            dev_smart['smart_%s_raw' % attr.get('id')] = value
        hours = sample.get('power_on_time', {}).get('hours')
        if hours is not None:
            # A reported power-on time overrides SMART attribute 9.
            dev_smart['smart_9_raw'] = int(hours)
        return dev_smart

    def _predict_life_expentancy(self, devid):
        """Run the local predictor on the health metrics of ``devid``.

        At least 6 health records are required; the newest records (by
        descending key order) are used, up to 12 usable samples.

        :param devid: device id passed to ``devicehealth``
        :return: whatever the predictor returns, or ``''`` when no
            prediction could be made
        """
        from .predictor import get_diskfailurepredictor_path, DiskFailurePredictor
        health_data = {}
        try:
            r, outb, outs = self.remote('devicehealth', 'show_device_metrics',
                                        devid=devid, sample='')
            if r != 0:
                self.log.error('failed to get device %s health', devid)
            else:
                health_data = json.loads(outb)
        except Exception as e:
            self.log.error('failed to get device %s health data due to %s', devid, str(e))

        obj_predictor = DiskFailurePredictor()
        obj_predictor.initialize("{}/models".format(get_diskfailurepredictor_path()))

        if len(health_data) < 6:
            self.log.error('unable to predict device due to health data records less than 6 days')
            return ''

        predict_datas = []
        for o_key in sorted(health_data, reverse=True):
            dev_smart = self._extract_smart_attrs(health_data[o_key])
            if dev_smart:
                predict_datas.append(dev_smart)
            if len(predict_datas) >= 12:
                break

        if not predict_datas:
            return ''
        return obj_predictor.predict(predict_datas)

    def predict_life_expectancy(self, devid):
        """Predict one device and describe its life expectancy.

        :return: ``(0, label, '')`` where label is ``'>6w'``,
            ``'>=2w and <=6w'``, ``'<2w'`` or ``'unknown'``
        """
        result = self._predict_life_expentancy(devid)
        label = self._LIFE_EXPECTANCY_LABELS.get(result.lower(), 'unknown')
        return 0, label, ''

    def _mon_command(self, payload, failure_message):
        """Send ``payload`` to the mon, log ``failure_message`` on error.

        :param payload: command dict, serialized as JSON
        :param failure_message: log format with one ``%s`` for the mon's
            status string
        :return: the command's return code
        """
        result = CommandResult('')
        self.send_command(result, 'mon', '', json.dumps(payload), '')
        ret, _, outs = result.wait()
        if ret != 0:
            self.log.error(failure_message, outs)
        return ret

    def _reset_device_life_expectancy(self, device_id):
        """Remove the stored life expectancy of ``device_id``.

        :return: return code of ``device rm-life-expectancy``
        """
        return self._mon_command(
            {
                'prefix': 'device rm-life-expectancy',
                'devid': device_id,
            },
            'failed to reset device life expectancy, %s')

    def _set_device_life_expectancy(self, device_id, from_date, to_date=None):
        """Store the life expectancy window of ``device_id``.

        :param from_date: start of the window
        :param to_date: end of the window; omitted from the command when None
        :return: return code of ``device set-life-expectancy``
        """
        payload = {
            'prefix': 'device set-life-expectancy',
            'devid': device_id,
            'from': from_date,
        }
        if to_date is not None:
            payload['to'] = to_date
        return self._mon_command(
            payload, 'failed to set device life expectancy, %s')

    def predict_all_devices(self):
        """Predict every device with daemons and store the result.

        A recognised verdict ('good', 'warning', 'bad') sets the device's
        life expectancy; any other result removes it.

        :return: always ``(0, 'succeed to predicted all devices', '')``
        """
        devices = self.get('devices').get('devices', [])
        for dev_info in devices:
            if not dev_info.get('daemons'):
                continue
            devid = dev_info.get('devid')
            if not devid:
                continue

            verdict = self._predict_life_expentancy(devid).lower()
            bounds = self._LIFE_EXPECTANCY_RANGES.get(verdict)
            if bounds is None:
                # 'unknown', an empty result or anything unrecognised.
                self._reset_device_life_expectancy(devid)
                continue

            predicted = int(time.time() * (1000 ** 3))
            day_min, day_max = bounds
            from_date = None
            to_date = None
            try:
                # day_min is 0 for 'bad', so it must not be tested for
                # truthiness: that used to clear the life expectancy of
                # exactly the devices predicted to fail soonest.
                from_date = self._convert_timestamp(predicted, day_min)
                if day_max is not None:
                    to_date = self._convert_timestamp(predicted, day_max)

                ret = self._set_device_life_expectancy(devid, from_date, to_date)
                # The failure case is already logged by the setter.
                if ret == 0:
                    self.log.info(
                        'succeed to set device %s life expectancy from: %s, to: %s',
                        devid, from_date, to_date)
            except Exception as e:
                self.log.error(
                    'failed to set device %s life expectancy from: %s, to: %s, %s',
                    devid, from_date, to_date, str(e))
        return 0, 'succeed to predicted all devices', ''