]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/diskprediction_local/predictor.py
import 15.2.0 Octopus source
[ceph.git] / ceph / src / pybind / mgr / diskprediction_local / predictor.py
CommitLineData
9f95a23c 1"""Machine learning model for disk failure prediction.
11fdf7f2 2
9f95a23c
TL
The classes defined here provide the disk failure prediction module.
4RHDiskFailurePredictor uses the models developed at the AICoE in the
5Office of the CTO at Red Hat. These models were built using the open
6source Backblaze SMART metrics dataset.
7PSDiskFailurePredictor uses the models developed by ProphetStor as an
8example.
11fdf7f2 9
9f95a23c
TL
10An instance of the predictor is initialized by providing the path to trained
11models. Then, to predict hard drive health and deduce time to failure, the
12predict function is called with 6 days worth of SMART data from the hard drive.
13It will return a string to indicate disk failure status: "Good", "Warning",
14"Bad", or "Unknown".
11fdf7f2
TL
15
16An example code is as follows:
17
9f95a23c 18>>> model = disk_failure_predictor.RHDiskFailurePredictor()
11fdf7f2
TL
19>>> status = model.initialize("./models")
20>>> if status:
21>>> model.predict(disk_days)
22'Bad'
11fdf7f2 23"""
11fdf7f2
TL
24import os
25import json
26import pickle
9f95a23c
TL
27import logging
28
29import numpy as np
30from scipy import stats
11fdf7f2
TL
31
32
def get_diskfailurepredictor_path():
    """Return the absolute path of the directory containing this module."""
    return os.path.dirname(os.path.abspath(__file__))
37
38
9f95a23c
TL
39class RHDiskFailurePredictor(object):
40 """Disk failure prediction module developed at Red Hat
11fdf7f2
TL
41
42 This class implements a disk failure prediction module.
43 """
44
9f95a23c
TL
45 # json with manufacturer names as keys
46 # and features used for prediction as values
11fdf7f2 47 CONFIG_FILE = "config.json"
9f95a23c
TL
48 PREDICTION_CLASSES = {-1: "Unknown", 0: "Good", 1: "Warning", 2: "Bad"}
49
50 # model name prefixes to identify vendor
51 MANUFACTURER_MODELNAME_PREFIXES = {
52 "WDC": "WDC",
53 "Toshiba": "Toshiba", # for cases like "Toshiba xxx"
54 "TOSHIBA": "Toshiba", # for cases like "TOSHIBA xxx"
55 "toshiba": "Toshiba", # for cases like "toshiba xxx"
56 "S": "Seagate", # for cases like "STxxxx" and "Seagate BarraCuda ZAxxx"
57 "ZA": "Seagate", # for cases like "ZAxxxx"
58 "Hitachi": "Hitachi",
59 "HGST": "HGST",
60 }
61
62 LOGGER = logging.getLogger()
63
64 def __init__(self):
65 """
66 This function may throw exception due to wrong file operation.
67 """
68 self.model_dirpath = ""
69 self.model_context = {}
70
71 def initialize(self, model_dirpath):
72 """Initialize all models. Save paths of all trained model files to list
73
74 Arguments:
75 model_dirpath {str} -- path to directory of trained models
76
77 Returns:
78 str -- Error message. If all goes well, return None
79 """
80 # read config file as json, if it exists
81 config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
82 if not os.path.isfile(config_path):
83 return "Missing config file: " + config_path
84 else:
85 with open(config_path) as f_conf:
86 self.model_context = json.load(f_conf)
87
88 # ensure all manufacturers whose context is defined in config file
89 # have models and scalers saved inside model_dirpath
90 for manufacturer in self.model_context:
91 scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.pkl")
92 if not os.path.isfile(scaler_path):
93 return "Missing scaler file: {}".format(scaler_path)
94 model_path = os.path.join(model_dirpath, manufacturer + "_predictor.pkl")
95 if not os.path.isfile(model_path):
96 return "Missing model file: {}".format(model_path)
97
98 self.model_dirpath = model_dirpath
99
100 def __preprocess(self, disk_days, manufacturer):
101 """Scales and transforms input dataframe to feed it to prediction model
102
103 Arguments:
104 disk_days {list} -- list in which each element is a dictionary with key,val
105 as feature name,value respectively.
106 e.g.[{'smart_1_raw': 0, 'user_capacity': 512 ...}, ...]
107 manufacturer {str} -- manufacturer of the hard drive
108
109 Returns:
110 numpy.ndarray -- (n, d) shaped array of n days worth of data and d
111 features, scaled
112 """
113 # get the attributes that were used to train model for current manufacturer
114 try:
115 model_smart_attr = self.model_context[manufacturer]
116 except KeyError as e:
117 RHDiskFailurePredictor.LOGGER.debug(
118 "No context (SMART attributes on which model has been trained) found for manufacturer: {}".format(
119 manufacturer
120 )
121 )
122 return None
123
124 # convert to structured array, keeping only the required features
125 # assumes all data is in float64 dtype
126 try:
127 struc_dtypes = [(attr, np.float64) for attr in model_smart_attr]
128 values = [tuple(day[attr] for attr in model_smart_attr) for day in disk_days]
129 disk_days_sa = np.array(values, dtype=struc_dtypes)
130 except KeyError as e:
131 RHDiskFailurePredictor.LOGGER.debug(
132 "Mismatch in SMART attributes used to train model and SMART attributes available"
133 )
134 return None
135
136 # view structured array as 2d array for applying rolling window transforms
137 # do not include capacity_bytes in this. only use smart_attrs
138 disk_days_attrs = disk_days_sa[[attr for attr in model_smart_attr if 'smart_' in attr]]\
139 .view(np.float64).reshape(disk_days_sa.shape + (-1,))
140
141 # featurize n (6 to 12) days data - mean,std,coefficient of variation
142 # current model is trained on 6 days of data because that is what will be
143 # available at runtime
144
145 # rolling time window interval size in days
146 roll_window_size = 6
147
148 # rolling means generator
149 gen = (disk_days_attrs[i: i + roll_window_size, ...].mean(axis=0) \
150 for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1))
151 means = np.vstack(gen)
152
153 # rolling stds generator
154 gen = (disk_days_attrs[i: i + roll_window_size, ...].std(axis=0, ddof=1) \
155 for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1))
156 stds = np.vstack(gen)
157
158 # coefficient of variation
159 cvs = stds / means
160 cvs[np.isnan(cvs)] = 0
161 featurized = np.hstack((
162 means,
163 stds,
164 cvs,
165 disk_days_sa['user_capacity'][: disk_days_attrs.shape[0] - roll_window_size + 1].reshape(-1, 1)
166 ))
167
168 # scale features
169 scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.pkl")
170 with open(scaler_path, 'rb') as f:
171 scaler = pickle.load(f)
172 featurized = scaler.transform(featurized)
173 return featurized
174
175 @staticmethod
176 def __get_manufacturer(model_name):
177 """Returns the manufacturer name for a given hard drive model name
178
179 Arguments:
180 model_name {str} -- hard drive model name
181
182 Returns:
183 str -- manufacturer name
184 """
185 for prefix, manufacturer in RHDiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.items():
186 if model_name.startswith(prefix):
187 return manufacturer
188 # print error message
189 RHDiskFailurePredictor.LOGGER.debug(
190 "Could not infer manufacturer from model name {}".format(model_name)
191 )
192
193 def predict(self, disk_days):
194 # get manufacturer preferably as a smartctl attribute
195 # if not available then infer using model name
196 manufacturer = disk_days[0].get("vendor")
197 if manufacturer is None:
198 RHDiskFailurePredictor.LOGGER.debug(
199 '"vendor" field not found in smartctl output. Will try to infer manufacturer from model name.'
200 )
201 manufacturer = RHDiskFailurePredictor.__get_manufacturer(
202 disk_days[0].get("model_name", "")
203 ).lower()
204
205 # print error message, return Unknown, and continue execution
206 if manufacturer is None:
207 RHDiskFailurePredictor.LOGGER.debug(
208 "Manufacturer could not be determiend. This may be because \
209 DiskPredictor has never encountered this manufacturer before, \
210 or the model name is not according to the manufacturer's \
211 naming conventions known to DiskPredictor"
212 )
213 return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]
214
215 # preprocess for feeding to model
216 preprocessed_data = self.__preprocess(disk_days, manufacturer)
217 if preprocessed_data is None:
218 return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]
219
220 # get model for current manufacturer
221 model_path = os.path.join(
222 self.model_dirpath, manufacturer + "_predictor.pkl"
223 )
224 with open(model_path, 'rb') as f:
225 model = pickle.load(f)
226
227 # use prediction for most recent day
228 # TODO: ensure that most recent day is last element and most previous day
229 # is first element in input disk_days
230 pred_class_id = model.predict(preprocessed_data)[-1]
231 return RHDiskFailurePredictor.PREDICTION_CLASSES[pred_class_id]
232
233
class PSDiskFailurePredictor(object):
    """Disk failure prediction developed at ProphetStor

    This class implements a disk failure prediction module. Several candidate
    models are scored against the SMART attributes a disk provides, and the
    best-matching models vote on the disk's health.
    """

    CONFIG_FILE = "config.json"
    # raw attributes excluded from the differential features
    EXCLUDED_ATTRS = ["smart_9_raw", "smart_241_raw", "smart_242_raw"]

    def __init__(self):
        """
        This function may throw exception due to wrong file operation.
        """
        self.model_dirpath = ""
        self.model_context = {}

    def initialize(self, model_dirpath):
        """
        Initialize all models.

        Args:
            model_dirpath: path to directory of trained models.

        Returns:
            Error message. If all goes well, return None.
        """
        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            return "Missing config file: " + config_path
        with open(config_path) as f_conf:
            self.model_context = json.load(f_conf)

        # every model named in the config must exist on disk
        for model_name in self.model_context:
            model_path = os.path.join(model_dirpath, model_name)
            if not os.path.isfile(model_path):
                return "Missing model file: " + model_path

        self.model_dirpath = model_dirpath

    def __preprocess(self, disk_days):
        """
        Preprocess disk attributes.

        Keeps only "smart_*_raw" attributes that are present on every day,
        are not in EXCLUDED_ATTRS, and have non-negative values.

        Args:
            disk_days: Refer to function predict(...).

        Returns:
            new_disk_days: Processed disk days.
        """
        req_attrs = []
        new_disk_days = []

        # only attributes present on every day are usable
        attr_list = set.intersection(*[set(disk_day.keys()) for disk_day in disk_days])
        for attr in attr_list:
            if (
                attr.startswith("smart_") and attr.endswith("_raw")
            ) and attr not in self.EXCLUDED_ATTRS:
                req_attrs.append(attr)

        for disk_day in disk_days:
            new_disk_day = {}
            for attr in req_attrs:
                if float(disk_day[attr]) >= 0.0:
                    new_disk_day[attr] = disk_day[attr]

            new_disk_days.append(new_disk_day)

        return new_disk_days

    @staticmethod
    def __get_diff_attrs(disk_days):
        """
        Get differential attributes between consecutive days.

        Args:
            disk_days: Refer to function predict(...).

        Returns:
            attr_list: S.M.A.R.T. attributes common to all given disk days
                (intersection set of all days).

            diff_disk_days: A list with one element fewer than disk_days;
                each dictionary contains day-over-day attribute deltas.

        Raises:
            Exceptions of wrong list/dict operations.
        """
        all_attrs = [set(disk_day.keys()) for disk_day in disk_days]
        # BUGFIX: the intersection below was previously overwritten with
        # disk_days[0].keys(), which discarded it and raised KeyError in the
        # delta computation whenever a later day lacked a day-0 attribute
        attr_list = list(set.intersection(*all_attrs))
        prev_days = disk_days[:-1]
        curr_days = disk_days[1:]
        diff_disk_days = []
        # TODO: ensure that this ordering is correct
        for prev, cur in zip(prev_days, curr_days):
            diff_disk_days.append(
                {attr: (int(cur[attr]) - int(prev[attr])) for attr in attr_list}
            )

        return attr_list, diff_disk_days

    def __get_best_models(self, attr_list):
        """
        Find the best models from the model list for the given attribute list.

        Args:
            attr_list: All S.M.A.R.T. attributes used in given disk.

        Returns:
            best_models: dict mapping model path to that model's 'Ordered'
                attribute list. Must be aware that SMART attributes is in
                order. None if too few attributes match any model.

        Raises:
        """
        model_names = list(self.model_context.keys())

        # score each model by how many of its attributes the disk provides
        scores = [
            sum(attr in attr_list for attr in self.model_context[model_name])
            for model_name in model_names
        ]
        # no models configured at all
        if not scores:
            return None
        max_score = max(scores)

        # Skip if too few matched attributes.
        if max_score < 3:
            print("Too few matched attributes")
            return None

        # keep every model within 1 point of the best score
        best_models = {}
        best_model_indices = [
            idx for idx, score in enumerate(scores) if score > max_score - 2
        ]
        for model_idx in best_model_indices:
            model_name = model_names[model_idx]
            model_path = os.path.join(self.model_dirpath, model_name)
            best_models[model_path] = self.model_context[model_name]

        return best_models

    @staticmethod
    def __get_ordered_attrs(disk_days, model_attrlist):
        """
        Return ordered attributes of given disk days.

        Args:
            disk_days: Unordered disk days.
            model_attrlist: Model's ordered attribute list.

        Returns:
            ordered_attrs: Ordered disk days (missing attributes become 0).

        Raises: None
        """
        ordered_attrs = []

        for one_day in disk_days:
            one_day_attrs = []

            for attr in model_attrlist:
                if attr in one_day:
                    one_day_attrs.append(one_day[attr])
                else:
                    one_day_attrs.append(0)

            ordered_attrs.append(one_day_attrs)

        return ordered_attrs

    def predict(self, disk_days):
        """
        Predict using given 6-days disk S.M.A.R.T. attributes.

        Args:
            disk_days: A list struct comprises 6 dictionaries. These
                       dictionaries store 'consecutive' days of disk SMART
                       attributes.
        Returns:
            A string indicates prediction result. One of following four strings
            will be returned according to disk failure status:
            (1) Good : Disk is healthy
            (2) Warning : Disk has some symptoms but may not fail immediately
            (3) Bad : Disk is in danger and data backup is highly recommended
            (4) Unknown : Not enough data for prediction.

        Raises:
            Pickle exceptions
        """
        all_pred = []

        proc_disk_days = self.__preprocess(disk_days)
        attr_list, diff_data = PSDiskFailurePredictor.__get_diff_attrs(proc_disk_days)
        modellist = self.__get_best_models(attr_list)
        if modellist is None:
            return "Unknown"

        for modelpath in modellist:
            model_attrlist = modellist[modelpath]
            ordered_data = PSDiskFailurePredictor.__get_ordered_attrs(
                diff_data, model_attrlist
            )

            try:
                with open(modelpath, "rb") as f_model:
                    clf = pickle.load(f_model)

            except UnicodeDecodeError:
                # Compatibility for python3: models were pickled under python2
                with open(modelpath, "rb") as f_model:
                    clf = pickle.load(f_model, encoding="latin1")

            # a model votes "fail" if any day is predicted as failing
            pred = clf.predict(ordered_data)

            all_pred.append(1 if any(pred) else 0)

        # aggregate the votes into a score; thresholds chosen by ProphetStor
        score = 2 ** sum(all_pred) - len(modellist)
        if score > 10:
            return "Bad"
        if score > 4:
            return "Warning"
        return "Good"