]>
Commit | Line | Data |
---|---|---|
9f95a23c | 1 | """Machine learning model for disk failure prediction. |
11fdf7f2 | 2 | |
9f95a23c TL |
3 | The classes defined here provide the disk failure prediction module.
4 | RHDiskFailurePredictor uses the models developed at the AICoE in the | |
5 | Office of the CTO at Red Hat. These models were built using the open | |
6 | source Backblaze SMART metrics dataset. | |
7 | PSDiskFailurePredictor uses the models developed by ProphetStor as an | |
8 | example. | |
11fdf7f2 | 9 | |
9f95a23c TL |
10 | An instance of the predictor is initialized by providing the path to trained |
11 | models. Then, to predict hard drive health and deduce time to failure, the | |
12 | predict function is called with 6 days worth of SMART data from the hard drive. | |
13 | It will return a string to indicate disk failure status: "Good", "Warning", | |
14 | "Bad", or "Unknown". | |
11fdf7f2 TL |
15 | |
16 | An example code is as follows: | |
17 | ||
9f95a23c | 18 | >>> model = disk_failure_predictor.RHDiskFailurePredictor() |
11fdf7f2 TL |
19 | >>> status = model.initialize("./models") |
20 | >>> if status: | |
21 | >>> model.predict(disk_days) | |
22 | 'Bad' | |
11fdf7f2 | 23 | """ |
11fdf7f2 TL |
24 | import os |
25 | import json | |
26 | import pickle | |
9f95a23c TL |
27 | import logging |
28 | ||
29 | import numpy as np | |
30 | from scipy import stats | |
11fdf7f2 TL |
31 | |
32 | ||
def get_diskfailurepredictor_path():
    """Return the absolute path of the directory containing this module."""
    return os.path.dirname(os.path.abspath(__file__))
37 | ||
38 | ||
9f95a23c TL |
class RHDiskFailurePredictor(object):
    """Disk failure prediction module developed at Red Hat

    This class implements a disk failure prediction module. Given several
    consecutive days of SMART data for a drive, ``predict`` returns one of
    "Good", "Warning", "Bad", or "Unknown".
    """

    # json with manufacturer names as keys
    # and features used for prediction as values
    CONFIG_FILE = "config.json"
    PREDICTION_CLASSES = {-1: "Unknown", 0: "Good", 1: "Warning", 2: "Bad"}

    # model name prefixes to identify vendor
    MANUFACTURER_MODELNAME_PREFIXES = {
        "WDC": "WDC",
        "Toshiba": "Toshiba",  # for cases like "Toshiba xxx"
        "TOSHIBA": "Toshiba",  # for cases like "TOSHIBA xxx"
        "toshiba": "Toshiba",  # for cases like "toshiba xxx"
        "S": "Seagate",  # for cases like "STxxxx" and "Seagate BarraCuda ZAxxx"
        "ZA": "Seagate",  # for cases like "ZAxxxx"
        "Hitachi": "Hitachi",
        "HGST": "HGST",
    }

    LOGGER = logging.getLogger()

    def __init__(self):
        """
        This function may throw exception due to wrong file operation.
        """
        self.model_dirpath = ""
        self.model_context = {}

    def initialize(self, model_dirpath):
        """Initialize all models. Save paths of all trained model files to list

        Arguments:
            model_dirpath {str} -- path to directory of trained models

        Returns:
            str -- Error message. If all goes well, return None
        """
        # read config file as json, if it exists
        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            return "Missing config file: " + config_path
        with open(config_path) as f_conf:
            self.model_context = json.load(f_conf)

        # ensure all manufacturers whose context is defined in config file
        # have models and scalers saved inside model_dirpath
        for manufacturer in self.model_context:
            scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.pkl")
            if not os.path.isfile(scaler_path):
                return "Missing scaler file: {}".format(scaler_path)
            model_path = os.path.join(model_dirpath, manufacturer + "_predictor.pkl")
            if not os.path.isfile(model_path):
                return "Missing model file: {}".format(model_path)

        self.model_dirpath = model_dirpath

    def __preprocess(self, disk_days, manufacturer):
        """Scales and transforms input dataframe to feed it to prediction model

        Arguments:
            disk_days {list} -- list in which each element is a dictionary with key,val
                                as feature name,value respectively.
                                e.g.[{'smart_1_raw': 0, 'user_capacity': 512 ...}, ...]
            manufacturer {str} -- manufacturer of the hard drive

        Returns:
            numpy.ndarray -- (n, d) shaped array of n days worth of data and d
                             features, scaled. None if preprocessing fails.
        """
        # get the attributes that were used to train model for current manufacturer
        try:
            model_smart_attr = self.model_context[manufacturer]
        except KeyError:
            RHDiskFailurePredictor.LOGGER.debug(
                "No context (SMART attributes on which model has been trained) found for manufacturer: {}".format(
                    manufacturer
                )
            )
            return None

        # convert to structured array, keeping only the required features
        # assumes all data is in float64 dtype
        try:
            struc_dtypes = [(attr, np.float64) for attr in model_smart_attr]
            values = [tuple(day[attr] for attr in model_smart_attr) for day in disk_days]
            disk_days_sa = np.array(values, dtype=struc_dtypes)
        except KeyError:
            RHDiskFailurePredictor.LOGGER.debug(
                "Mismatch in SMART attributes used to train model and SMART attributes available"
            )
            return None

        # view structured array as 2d array for applying rolling window transforms
        # do not include capacity_bytes in this. only use smart_attrs
        disk_days_attrs = disk_days_sa[[attr for attr in model_smart_attr if 'smart_' in attr]]\
            .view(np.float64).reshape(disk_days_sa.shape + (-1,))

        # featurize n (6 to 12) days data - mean,std,coefficient of variation
        # current model is trained on 6 days of data because that is what will be
        # available at runtime

        # rolling time window interval size in days
        roll_window_size = 6

        # rolling means generator
        gen = (disk_days_attrs[i: i + roll_window_size, ...].mean(axis=0)
               for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1))
        means = np.vstack(gen)

        # rolling stds generator
        gen = (disk_days_attrs[i: i + roll_window_size, ...].std(axis=0, ddof=1)
               for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1))
        stds = np.vstack(gen)

        # coefficient of variation; 0/0 windows produce NaN, mapped to 0
        cvs = stds / means
        cvs[np.isnan(cvs)] = 0
        featurized = np.hstack((
            means,
            stds,
            cvs,
            disk_days_sa['user_capacity'][: disk_days_attrs.shape[0] - roll_window_size + 1].reshape(-1, 1)
        ))

        # scale features using the scaler trained alongside the model
        scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.pkl")
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
        featurized = scaler.transform(featurized)
        return featurized

    @staticmethod
    def __get_manufacturer(model_name):
        """Returns the manufacturer name for a given hard drive model name

        Arguments:
            model_name {str} -- hard drive model name

        Returns:
            str -- manufacturer name, or None if it cannot be inferred
        """
        for prefix, manufacturer in RHDiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.items():
            if model_name.startswith(prefix):
                return manufacturer
        # print error message
        RHDiskFailurePredictor.LOGGER.debug(
            "Could not infer manufacturer from model name {}".format(model_name)
        )

    def predict(self, disk_days):
        """Predict disk failure status from SMART data.

        Arguments:
            disk_days {list} -- list of dicts of daily SMART attributes;
                                first element is used to determine manufacturer

        Returns:
            str -- one of PREDICTION_CLASSES values
                   ("Unknown", "Good", "Warning", "Bad")
        """
        # get manufacturer preferably as a smartctl attribute
        # if not available then infer using model name
        manufacturer = disk_days[0].get("vendor")
        if manufacturer is None:
            RHDiskFailurePredictor.LOGGER.debug(
                '"vendor" field not found in smartctl output. Will try to infer manufacturer from model name.'
            )
            manufacturer = RHDiskFailurePredictor.__get_manufacturer(
                disk_days[0].get("model_name", "")
            )
            # BUGFIX: __get_manufacturer returns None for unrecognized model
            # names; calling .lower() on it unconditionally raised
            # AttributeError before the None check below could run.
            if manufacturer is not None:
                manufacturer = manufacturer.lower()

        # print error message, return Unknown, and continue execution
        if manufacturer is None:
            RHDiskFailurePredictor.LOGGER.debug(
                "Manufacturer could not be determined. This may be because "
                "DiskPredictor has never encountered this manufacturer before, "
                "or the model name is not according to the manufacturer's "
                "naming conventions known to DiskPredictor"
            )
            return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]

        # preprocess for feeding to model
        preprocessed_data = self.__preprocess(disk_days, manufacturer)
        if preprocessed_data is None:
            return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]

        # get model for current manufacturer
        model_path = os.path.join(
            self.model_dirpath, manufacturer + "_predictor.pkl"
        )
        with open(model_path, 'rb') as f:
            model = pickle.load(f)

        # use prediction for most recent day
        # TODO: ensure that most recent day is last element and most previous day
        # is first element in input disk_days
        pred_class_id = model.predict(preprocessed_data)[-1]
        return RHDiskFailurePredictor.PREDICTION_CLASSES[pred_class_id]
232 | ||
233 | ||
class PSDiskFailurePredictor(object):
    """Disk failure prediction developed at ProphetStor

    This class implements a disk failure prediction module.
    """

    CONFIG_FILE = "config.json"
    # attributes excluded from prediction (power-on hours, LBA counters)
    EXCLUDED_ATTRS = ["smart_9_raw", "smart_241_raw", "smart_242_raw"]

    def __init__(self):
        """
        This function may throw exception due to wrong file operation.
        """

        self.model_dirpath = ""
        self.model_context = {}

    def initialize(self, model_dirpath):
        """
        Initialize all models.

        Args:
            model_dirpath: Path to the directory of trained models.

        Returns:
            Error message. If all goes well, return None.

        Raises:
        """

        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            return "Missing config file: " + config_path
        with open(config_path) as f_conf:
            self.model_context = json.load(f_conf)

        # every model named in the config must exist on disk
        for model_name in self.model_context:
            model_path = os.path.join(model_dirpath, model_name)

            if not os.path.isfile(model_path):
                return "Missing model file: " + model_path

        self.model_dirpath = model_dirpath

    def __preprocess(self, disk_days):
        """
        Preprocess disk attributes.

        Keeps only non-negative "smart_*_raw" attributes that are present on
        every day and not in EXCLUDED_ATTRS.

        Args:
            disk_days: Refer to function predict(...).

        Returns:
            new_disk_days: Processed disk days.
        """

        req_attrs = []
        new_disk_days = []

        # only consider attributes reported on every day
        attr_list = set.intersection(*[set(disk_day.keys()) for disk_day in disk_days])
        for attr in attr_list:
            if (
                attr.startswith("smart_") and attr.endswith("_raw")
            ) and attr not in self.EXCLUDED_ATTRS:
                req_attrs.append(attr)

        for disk_day in disk_days:
            new_disk_day = {}
            for attr in req_attrs:
                # drop negative (invalid/unsupported) raw values per-day
                if float(disk_day[attr]) >= 0.0:
                    new_disk_day[attr] = disk_day[attr]

            new_disk_days.append(new_disk_day)

        return new_disk_days

    @staticmethod
    def __get_diff_attrs(disk_days):
        """
        Get 5 days differential attributes.

        Args:
            disk_days: Refer to function predict(...).

        Returns:
            attr_list: All S.M.A.R.T. attributes used in given disk. Here we
                       use intersection set of all disk days.

            diff_disk_days: A list struct comprises 5 dictionaries, each
                            dictionary contains differential attributes.

        Raises:
            Exceptions of wrong list/dict operations.
        """

        all_attrs = [set(disk_day.keys()) for disk_day in disk_days]
        attr_list = list(set.intersection(*all_attrs))
        # BUGFIX: a later assignment "attr_list = disk_days[0].keys()" used to
        # clobber the intersection above; since __preprocess may drop an
        # attribute on some days but not day 0, that raised KeyError below.
        # Use the intersection so every attr is present on every day.
        prev_days = disk_days[:-1]
        curr_days = disk_days[1:]
        diff_disk_days = []
        # TODO: ensure that this ordering is correct
        for prev, cur in zip(prev_days, curr_days):
            diff_disk_days.append(
                {attr: (int(cur[attr]) - int(prev[attr])) for attr in attr_list}
            )

        return attr_list, diff_disk_days

    def __get_best_models(self, attr_list):
        """
        Find the best model from model list according to given attribute list.

        Args:
            attr_list: All S.M.A.R.T. attributes used in given disk.

        Returns:
            best_models: dict mapping model path -> its 'ordered' attribute
                         list. Must be aware that SMART attributes is in order.
                         None if too few attributes match any model.

        Raises:
        """

        models = list(self.model_context.keys())

        # score each model by how many of its training attrs the disk has
        scores = []
        for model_name in models:
            scores.append(
                sum(attr in attr_list for attr in self.model_context[model_name])
            )
        max_score = max(scores)

        # Skip if too few matched attributes.
        if max_score < 3:
            print("Too few matched attributes")
            return None

        best_models = {}
        # keep every model within 1 point of the best score
        best_model_indices = [
            idx for idx, score in enumerate(scores) if score > max_score - 2
        ]
        for model_idx in best_model_indices:
            model_name = models[model_idx]
            model_path = os.path.join(self.model_dirpath, model_name)
            model_attrlist = self.model_context[model_name]
            best_models[model_path] = model_attrlist

        return best_models

    @staticmethod
    def __get_ordered_attrs(disk_days, model_attrlist):
        """
        Return ordered attributes of given disk days.

        Args:
            disk_days: Unordered disk days.
            model_attrlist: Model's ordered attribute list.

        Returns:
            ordered_attrs: Ordered disk days; missing attributes become 0.

        Raises: None
        """

        ordered_attrs = []

        for one_day in disk_days:
            one_day_attrs = []

            for attr in model_attrlist:
                if attr in one_day:
                    one_day_attrs.append(one_day[attr])
                else:
                    one_day_attrs.append(0)

            ordered_attrs.append(one_day_attrs)

        return ordered_attrs

    def predict(self, disk_days):
        """
        Predict using given 6-days disk S.M.A.R.T. attributes.

        Args:
            disk_days: A list struct comprises 6 dictionaries. These
                       dictionaries store 'consecutive' days of disk SMART
                       attributes.
        Returns:
            A string indicates prediction result. One of following four strings
            will be returned according to disk failure status:
            (1) Good : Disk is healthy
            (2) Warning : Disk has some symptoms but may not fail immediately
            (3) Bad : Disk is in danger and data backup is highly recommended
            (4) Unknown : Not enough data for prediction.

        Raises:
            Pickle exceptions
        """

        all_pred = []

        proc_disk_days = self.__preprocess(disk_days)
        attr_list, diff_data = PSDiskFailurePredictor.__get_diff_attrs(proc_disk_days)
        modellist = self.__get_best_models(attr_list)
        if modellist is None:
            return "Unknown"

        for modelpath in modellist:
            model_attrlist = modellist[modelpath]
            ordered_data = PSDiskFailurePredictor.__get_ordered_attrs(
                diff_data, model_attrlist
            )

            try:
                with open(modelpath, "rb") as f_model:
                    clf = pickle.load(f_model)

            except UnicodeDecodeError:
                # Compatibility for python3
                with open(modelpath, "rb") as f_model:
                    clf = pickle.load(f_model, encoding="latin1")

            pred = clf.predict(ordered_data)

            # each model votes 1 if it predicts failure on any day
            all_pred.append(1 if any(pred) else 0)

        # exponential vote aggregation: more agreeing models -> higher score
        score = 2 ** sum(all_pred) - len(modellist)
        if score > 10:
            return "Bad"
        if score > 4:
            return "Warning"
        return "Good"