]> git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/diskprediction_local/predictor.py
import quincy beta 17.1.0
[ceph.git] / ceph / src / pybind / mgr / diskprediction_local / predictor.py
1 """Machine learning model for disk failure prediction.
2
3 The classes defined here provide the disk failure prediction module.
4 RHDiskFailurePredictor uses the models developed at the AICoE in the
5 Office of the CTO at Red Hat. These models were built using the open
6 source Backblaze SMART metrics dataset.
7 PSDiskFailurePredictor uses the models developed by ProphetStor as an
8 example.
9
10 An instance of the predictor is initialized by providing the path to trained
11 models. Then, to predict hard drive health and deduce time to failure, the
12 predict function is called with 6 days worth of SMART data from the hard drive.
13 It will return a string to indicate disk failure status: "Good", "Warning",
14 "Bad", or "Unknown".
15
16 An example code is as follows:
17
18 >>> model = RHDiskFailurePredictor()
19 >>> model.initialize(get_diskfailurepredictor_path() + "/models/redhat")
20 >>> vendor = list(RHDiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.keys())[0]
21 >>> disk_days = [{'vendor': vendor}]
22 >>> model.predict(disk_days)
23 'Unknown'
24 """
25 import os
26 import json
27 import pickle
28 import logging
29 from typing import Any, Dict, List, Optional, Sequence, Tuple
30
31 import numpy as np
32
33
def get_diskfailurepredictor_path() -> str:
    """Return the absolute path of the directory containing this module.

    Used to locate the bundled trained-model directories relative to the
    module file itself.
    """
    return os.path.dirname(os.path.abspath(__file__))
38
39
# Type aliases shared by the predictor classes below.
DevSmartT = Dict[str, Any]  # one day of SMART data: attribute name -> value
AttrNamesT = List[str]  # SMART attribute names
AttrDiffsT = List[Dict[str, int]]  # per-day integer diffs of SMART attributes
43
44
class Predictor:
    """Abstract base class for disk failure predictors."""

    @classmethod
    def create(cls, name: str) -> Optional['Predictor']:
        """Factory: return a predictor instance for *name*.

        Known names are 'prophetstor' and 'redhat'; any other name
        yields None.
        """
        if name == 'prophetstor':
            return PSDiskFailurePredictor()
        if name == 'redhat':
            return RHDiskFailurePredictor()
        return None

    def initialize(self, model_dir: str) -> None:
        """Load trained models from *model_dir*; implemented by subclasses."""
        raise NotImplementedError()

    def predict(self, dataset: Sequence[DevSmartT]) -> str:
        """Return a health-status string for *dataset*; implemented by subclasses."""
        raise NotImplementedError()
60
61
class RHDiskFailurePredictor(Predictor):
    """Disk failure prediction module developed at Red Hat

    This class implements a disk failure prediction module.  For each
    supported manufacturer it expects two pickled files in the model
    directory: ``<manufacturer>_scaler.pkl`` and
    ``<manufacturer>_predictor.pkl``, plus a ``config.json`` mapping
    manufacturer name to the ordered list of SMART attributes the model
    was trained on.
    """

    # json with manufacturer names as keys
    # and features used for prediction as values
    CONFIG_FILE = "config.json"
    # Mapping from the integer class emitted by the trained model to the
    # status string returned by predict(); -1 is the local "no prediction" code.
    PREDICTION_CLASSES = {-1: "Unknown", 0: "Good", 1: "Warning", 2: "Bad"}

    # model name prefixes to identify vendor
    MANUFACTURER_MODELNAME_PREFIXES = {
        "WDC": "WDC",
        "Toshiba": "Toshiba",  # for cases like "Toshiba xxx"
        "TOSHIBA": "Toshiba",  # for cases like "TOSHIBA xxx"
        "toshiba": "Toshiba",  # for cases like "toshiba xxx"
        "S": "Seagate",  # for cases like "STxxxx" and "Seagate BarraCuda ZAxxx"
        "ZA": "Seagate",  # for cases like "ZAxxxx"
        "Hitachi": "Hitachi",
        "HGST": "HGST",
    }

    # NOTE(review): getLogger() with no name returns the ROOT logger, so these
    # debug messages go through the root handler chain — consider
    # logging.getLogger(__name__); confirm before changing.
    LOGGER = logging.getLogger()

    def __init__(self) -> None:
        """
        This function may throw exception due to wrong file operation.
        """
        # Directory holding the pickled scaler/predictor files; set by initialize().
        self.model_dirpath = ""
        # config.json contents: manufacturer -> ordered list of SMART attr names.
        self.model_context: Dict[str, List[str]] = {}

    def initialize(self, model_dirpath: str) -> None:
        """Initialize all models. Save paths of all trained model files to list

        Arguments:
            model_dirpath {str} -- path to directory of trained models

        Raises:
            Exception -- if the config file or any referenced scaler/model
                         pickle file is missing from model_dirpath
        """
        # read config file as json, if it exists
        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            raise Exception("Missing config file: " + config_path)
        with open(config_path) as f_conf:
            self.model_context = json.load(f_conf)

        # ensure all manufacturers whose context is defined in config file
        # have models and scalers saved inside model_dirpath
        for manufacturer in self.model_context:
            scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.pkl")
            if not os.path.isfile(scaler_path):
                raise Exception(f"Missing scaler file: {scaler_path}")
            model_path = os.path.join(model_dirpath, manufacturer + "_predictor.pkl")
            if not os.path.isfile(model_path):
                raise Exception(f"Missing model file: {model_path}")

        # only record the directory once every expected file is present
        self.model_dirpath = model_dirpath

    def __preprocess(self, disk_days: Sequence[DevSmartT], manufacturer: str) -> Optional[np.ndarray]:
        """Scales and transforms input dataframe to feed it to prediction model

        Arguments:
            disk_days {list} -- list in which each element is a dictionary with key,val
                                as feature name,value respectively.
                                e.g.[{'smart_1_raw': 0, 'user_capacity': 512 ...}, ...]
            manufacturer {str} -- manufacturer of the hard drive

        Returns:
            numpy.ndarray -- (n, d) shaped array of n days worth of data and d
                             features, scaled; None if the manufacturer is
                             unknown or a required SMART attribute is missing
        """
        # get the attributes that were used to train model for current manufacturer
        try:
            model_smart_attr = self.model_context[manufacturer]
        except KeyError:
            RHDiskFailurePredictor.LOGGER.debug(
                "No context (SMART attributes on which model has been trained) found for manufacturer: {}".format(
                    manufacturer
                )
            )
            return None

        # convert to structured array, keeping only the required features
        # assumes all data is in float64 dtype
        try:
            struc_dtypes = [(attr, np.float64) for attr in model_smart_attr]
            values = [tuple(day[attr] for attr in model_smart_attr) for day in disk_days]
            disk_days_sa = np.array(values, dtype=struc_dtypes)
        except KeyError:
            # a day dict lacked one of the attributes the model needs
            RHDiskFailurePredictor.LOGGER.debug(
                "Mismatch in SMART attributes used to train model and SMART attributes available"
            )
            return None

        # view structured array as 2d array for applying rolling window transforms
        # do not include capacity_bytes in this. only use smart_attrs
        # (the float64 view is safe because every field was declared float64 above)
        disk_days_attrs = disk_days_sa[[attr for attr in model_smart_attr if 'smart_' in attr]]\
            .view(np.float64).reshape(disk_days_sa.shape + (-1,))

        # featurize n (6 to 12) days data - mean,std,coefficient of variation
        # current model is trained on 6 days of data because that is what will be
        # available at runtime

        # rolling time window interval size in days
        roll_window_size = 6

        # rolling means generator
        # NOTE(review): if fewer than 6 days are supplied, dataset_size <= 0 and
        # np.vstack on an empty generator will raise — confirm callers always
        # pass >= 6 days.
        dataset_size = disk_days_attrs.shape[0] - roll_window_size + 1
        gen = (disk_days_attrs[i: i + roll_window_size, ...].mean(axis=0)
               for i in range(dataset_size))
        means = np.vstack(gen)  # type: ignore

        # rolling stds generator (ddof=1: sample standard deviation)
        gen = (disk_days_attrs[i: i + roll_window_size, ...].std(axis=0, ddof=1)
               for i in range(dataset_size))
        stds = np.vstack(gen)  # type: ignore

        # coefficient of variation
        # NOTE(review): isnan only zeroes 0/0 results; a zero mean with nonzero
        # std yields inf which passes through to the scaler — confirm intended.
        cvs = stds / means
        cvs[np.isnan(cvs)] = 0
        featurized = np.hstack((means,
                                stds,
                                cvs,
                                disk_days_sa['user_capacity'][: dataset_size].reshape(-1, 1)))

        # scale features using the scaler pickled alongside the model
        scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.pkl")
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
        featurized = scaler.transform(featurized)
        return featurized

    @staticmethod
    def __get_manufacturer(model_name: str) -> Optional[str]:
        """Returns the manufacturer name for a given hard drive model name

        Arguments:
            model_name {str} -- hard drive model name

        Returns:
            str -- lower-cased manufacturer name, or None if no known prefix
                   matches (lower-case matches the config.json keys)
        """
        for prefix, manufacturer in RHDiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.items():
            if model_name.startswith(prefix):
                return manufacturer.lower()
        # print error message
        RHDiskFailurePredictor.LOGGER.debug(
            f"Could not infer manufacturer from model name {model_name}")
        return None

    def predict(self, disk_days: Sequence[DevSmartT]) -> str:
        """Predict health for one device from its recent daily SMART data.

        Returns one of the PREDICTION_CLASSES strings; "Unknown" when the
        manufacturer cannot be determined or preprocessing fails.
        """
        # get manufacturer preferably as a smartctl attribute
        # if not available then infer using model name
        # NOTE(review): the "vendor" value is used as-is as a config key, while
        # __get_manufacturer returns lower-case — confirm smartctl vendor
        # strings match the config.json key casing.
        manufacturer = disk_days[0].get("vendor")
        if manufacturer is None:
            RHDiskFailurePredictor.LOGGER.debug(
                '"vendor" field not found in smartctl output. Will try to infer manufacturer from model name.'
            )
            manufacturer = RHDiskFailurePredictor.__get_manufacturer(
                disk_days[0].get("model_name", ""))

        # print error message, return Unknown, and continue execution
        # (NOTE(review): "determiend" typo is in the runtime log string; left
        # untouched here)
        if manufacturer is None:
            RHDiskFailurePredictor.LOGGER.debug(
                "Manufacturer could not be determiend. This may be because \
                DiskPredictor has never encountered this manufacturer before, \
                or the model name is not according to the manufacturer's \
                naming conventions known to DiskPredictor"
            )
            return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]

        # preprocess for feeding to model
        preprocessed_data = self.__preprocess(disk_days, manufacturer)
        if preprocessed_data is None:
            return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]

        # get model for current manufacturer
        model_path = os.path.join(
            self.model_dirpath, manufacturer + "_predictor.pkl"
        )
        with open(model_path, 'rb') as f:
            model = pickle.load(f)

        # use prediction for most recent day
        # TODO: ensure that most recent day is last element and most previous day
        # is first element in input disk_days
        pred_class_id = model.predict(preprocessed_data)[-1]
        return RHDiskFailurePredictor.PREDICTION_CLASSES[pred_class_id]
252
253
class PSDiskFailurePredictor(Predictor):
    """Disk failure prediction developed at ProphetStor

    This class implements a disk failure prediction module.
    """

    CONFIG_FILE = "config.json"
    EXCLUDED_ATTRS = ["smart_9_raw", "smart_241_raw", "smart_242_raw"]

    def __init__(self) -> None:
        """
        This function may throw exception due to wrong file operation.
        """
        # Directory holding the pickled models; set by initialize().
        self.model_dirpath = ""
        # config.json contents: model file name -> ordered SMART attribute list.
        self.model_context: Dict[str, List[str]] = {}

    def initialize(self, model_dirpath: str) -> None:
        """Load the model configuration and verify every model file exists.

        Arguments:
            model_dirpath: path to the directory of trained models.

        Raises:
            Exception: if the config file or any model file listed in it
                       is missing.
        """
        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            raise Exception(f"Missing config file: {config_path}")
        with open(config_path) as f_conf:
            self.model_context = json.load(f_conf)

        # every model named in the config must be present on disk
        for model_name in self.model_context:
            model_path = os.path.join(model_dirpath, model_name)
            if not os.path.isfile(model_path):
                raise Exception(f"Missing model file: {model_path}")

        self.model_dirpath = model_dirpath

    def __preprocess(self, disk_days: Sequence[DevSmartT]) -> Sequence[DevSmartT]:
        """Filter disk days down to usable SMART raw attributes.

        Keeps only smart_*_raw attributes present on every day, drops the
        excluded ones, and drops negative values per day.

        Arguments:
            disk_days: refer to predict(...).

        Returns:
            List of per-day dicts containing only the retained attributes.
        """
        # attributes common to every supplied day
        common_attrs = set.intersection(*(set(day.keys()) for day in disk_days))
        wanted_attrs = [
            attr
            for attr in common_attrs
            if attr.startswith("smart_")
            and attr.endswith("_raw")
            and attr not in self.EXCLUDED_ATTRS
        ]

        # per day, keep only non-negative values of the wanted attributes
        return [
            {attr: day[attr] for attr in wanted_attrs if float(day[attr]) >= 0.0}
            for day in disk_days
        ]

    @staticmethod
    def __get_diff_attrs(disk_days: Sequence[DevSmartT]) -> Tuple[AttrNamesT, AttrDiffsT]:
        """Compute day-over-day differences of the common SMART attributes.

        Arguments:
            disk_days: refer to predict(...).

        Returns:
            attr_list: attributes present on every day (intersection).
            diff_disk_days: one dict per consecutive day pair holding the
                            integer difference for each attribute.
        """
        shared = [set(day.keys()) for day in disk_days]
        attr_list = list(set.intersection(*shared))
        # TODO: ensure that this ordering is correct
        diff_disk_days = [
            {attr: int(later[attr]) - int(earlier[attr]) for attr in attr_list}
            for earlier, later in zip(disk_days, disk_days[1:])
        ]
        return attr_list, diff_disk_days

    def __get_best_models(self, attr_list: AttrNamesT) -> Optional[Dict[str, List[str]]]:
        """Select the models that best match the available attributes.

        Arguments:
            attr_list: all SMART attributes available for the disk.

        Returns:
            Mapping of model file path -> that model's ordered attribute
            list, for every model whose match score is within 2 of the best;
            None when even the best model matches fewer than 3 attributes.
        """
        model_names = list(self.model_context)
        # score = how many of the model's training attributes we have
        scores = [
            sum(attr in attr_list for attr in self.model_context[name])
            for name in model_names
        ]
        max_score = max(scores)

        # Skip if too few matched attributes.
        if max_score < 3:
            print("Too few matched attributes")
            return None

        return {
            os.path.join(self.model_dirpath, name): self.model_context[name]
            for name, score in zip(model_names, scores)
            if score > max_score - 2
        }

    @staticmethod
    def __get_ordered_attrs(disk_days: Sequence[DevSmartT], model_attrlist: List[str]) -> List[List[float]]:
        """Project disk days onto a model's ordered attribute list.

        Arguments:
            disk_days: unordered disk days.
            model_attrlist: the model's ordered attribute list.

        Returns:
            One row per day, values in model order; 0 for missing attributes.
        """
        return [
            [day.get(attr, 0) for attr in model_attrlist]
            for day in disk_days
        ]

    def predict(self, disk_days: Sequence[DevSmartT]) -> str:
        """
        Predict using given 6-days disk S.M.A.R.T. attributes.

        Args:
            disk_days: A list struct comprises 6 dictionaries. These
                       dictionaries store 'consecutive' days of disk SMART
                       attributes.
        Returns:
            A string indicates prediction result. One of following four strings
            will be returned according to disk failure status:
            (1) Good : Disk is health
            (2) Warning : Disk has some symptoms but may not fail immediately
            (3) Bad : Disk is in danger and data backup is highly recommended
            (4) Unknown : Not enough data for prediction.

        Raises:
            Pickle exceptions
        """
        proc_disk_days = self.__preprocess(disk_days)
        attr_list, diff_data = PSDiskFailurePredictor.__get_diff_attrs(proc_disk_days)
        best_models = self.__get_best_models(attr_list)
        if best_models is None:
            return "Unknown"

        votes = []
        for model_path, model_attrlist in best_models.items():
            ordered_data = PSDiskFailurePredictor.__get_ordered_attrs(
                diff_data, model_attrlist
            )

            try:
                with open(model_path, "rb") as f_model:
                    clf = pickle.load(f_model)
            except UnicodeDecodeError:
                # Compatibility for python3
                with open(model_path, "rb") as f_model:
                    clf = pickle.load(f_model, encoding="latin1")

            # one vote per model: does it predict failure on any day?
            predictions = clf.predict(ordered_data)
            votes.append(1 if any(predictions) else 0)

        # aggregate: exponential in the number of agreeing models
        score = 2 ** sum(votes) - len(best_models)
        if score > 10:
            return "Bad"
        if score > 4:
            return "Warning"
        return "Good"