]> git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/diskprediction_local/predictor.py
import quincy beta 17.1.0
[ceph.git] / ceph / src / pybind / mgr / diskprediction_local / predictor.py
1 """Machine learning model for disk failure prediction.
2
3 The classes defined here provide the disk failure prediction module.
4 RHDiskFailurePredictor uses the models developed at the AICoE in the
5 Office of the CTO at Red Hat. These models were built using the open
6 source Backblaze SMART metrics dataset.
7 PSDiskFailurePredictor uses the models developed by ProphetStor as an
8 example.
9
10 An instance of the predictor is initialized by providing the path to trained
11 models. Then, to predict hard drive health and deduce time to failure, the
12 predict function is called with 6 days worth of SMART data from the hard drive.
13 It will return a string to indicate disk failure status: "Good", "Warning",
14 "Bad", or "Unknown".
15
16 An example code is as follows:
17
18 >>> model = RHDiskFailurePredictor()
19 >>> model.initialize(get_diskfailurepredictor_path() + "/models/redhat")
20 >>> vendor = list(RHDiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.keys())[0]
21 >>> disk_days = [{'vendor': vendor}]
22 >>> model.predict(disk_days)
23 'Unknown'
24 """
25 import os
26 import json
27 import pickle
28 import logging
29 from typing import Any, Dict, List, Optional, Sequence, Tuple
30
31 import numpy as np
32
33
def get_diskfailurepredictor_path() -> str:
    """Return the absolute path of the directory containing this module.

    Used to locate the bundled trained-model directories relative to the
    module file itself.
    """
    return os.path.dirname(os.path.abspath(__file__))
38
39
# Type aliases shared by the predictor classes below.
DevSmartT = Dict[str, Any]  # one day of SMART data: attribute name -> value
AttrNamesT = List[str]  # SMART attribute names
AttrDiffsT = List[Dict[str, int]]  # per-day integer diffs of SMART attributes
43
44
class Predictor:
    """Abstract base class for disk failure predictors."""

    @classmethod
    def create(cls, name: str) -> Optional['Predictor']:
        """Factory: return a predictor instance for *name*.

        Known names are 'prophetstor' and 'redhat'; any other name
        yields None.
        """
        if name == 'prophetstor':
            return PSDiskFailurePredictor()
        if name == 'redhat':
            return RHDiskFailurePredictor()
        return None

    def initialize(self, model_dir: str) -> None:
        """Load trained models from *model_dir*; implemented by subclasses."""
        raise NotImplementedError()

    def predict(self, dataset: Sequence[DevSmartT]) -> str:
        """Return a health-status string for *dataset*; implemented by subclasses."""
        raise NotImplementedError()
60
61
class RHDiskFailurePredictor(Predictor):
    """Disk failure prediction module developed at Red Hat

    This class implements a disk failure prediction module.  For each
    supported manufacturer it expects two pickled files in the model
    directory: ``<manufacturer>_scaler.pkl`` and
    ``<manufacturer>_predictor.pkl``, plus a ``config.json`` mapping
    manufacturer name to the ordered list of SMART attributes the model
    was trained on.
    """

    # json with manufacturer names as keys
    # and features used for prediction as values
    CONFIG_FILE = "config.json"
    # Mapping from the integer class emitted by the trained model to the
    # status string returned by predict(); -1 is the local "no prediction" code.
    PREDICTION_CLASSES = {-1: "Unknown", 0: "Good", 1: "Warning", 2: "Bad"}

    # model name prefixes to identify vendor
    MANUFACTURER_MODELNAME_PREFIXES = {
        "WDC": "WDC",
        "Toshiba": "Toshiba",  # for cases like "Toshiba xxx"
        "TOSHIBA": "Toshiba",  # for cases like "TOSHIBA xxx"
        "toshiba": "Toshiba",  # for cases like "toshiba xxx"
        "S": "Seagate",  # for cases like "STxxxx" and "Seagate BarraCuda ZAxxx"
        "ZA": "Seagate",  # for cases like "ZAxxxx"
        "Hitachi": "Hitachi",
        "HGST": "HGST",
    }

    # NOTE(review): getLogger() with no name returns the ROOT logger, so these
    # debug messages go through the root handler chain — consider
    # logging.getLogger(__name__); confirm before changing.
    LOGGER = logging.getLogger()

    def __init__(self) -> None:
        """
        This function may throw exception due to wrong file operation.
        """
        # Directory holding the pickled scaler/predictor files; set by initialize().
        self.model_dirpath = ""
        # config.json contents: manufacturer -> ordered list of SMART attr names.
        self.model_context: Dict[str, List[str]] = {}

    def initialize(self, model_dirpath: str) -> None:
        """Initialize all models. Save paths of all trained model files to list

        Arguments:
            model_dirpath {str} -- path to directory of trained models

        Raises:
            Exception -- if the config file or any referenced scaler/model
                         pickle file is missing from model_dirpath
        """
        # read config file as json, if it exists
        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            raise Exception("Missing config file: " + config_path)
        with open(config_path) as f_conf:
            self.model_context = json.load(f_conf)

        # ensure all manufacturers whose context is defined in config file
        # have models and scalers saved inside model_dirpath
        for manufacturer in self.model_context:
            scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.pkl")
            if not os.path.isfile(scaler_path):
                raise Exception(f"Missing scaler file: {scaler_path}")
            model_path = os.path.join(model_dirpath, manufacturer + "_predictor.pkl")
            if not os.path.isfile(model_path):
                raise Exception(f"Missing model file: {model_path}")

        # only record the directory once every expected file is present
        self.model_dirpath = model_dirpath

    def __preprocess(self, disk_days: Sequence[DevSmartT], manufacturer: str) -> Optional[np.ndarray]:
        """Scales and transforms input dataframe to feed it to prediction model

        Arguments:
            disk_days {list} -- list in which each element is a dictionary with key,val
                                as feature name,value respectively.
                                e.g.[{'smart_1_raw': 0, 'user_capacity': 512 ...}, ...]
            manufacturer {str} -- manufacturer of the hard drive

        Returns:
            numpy.ndarray -- (n, d) shaped array of n days worth of data and d
                             features, scaled; None if the manufacturer is
                             unknown or a required SMART attribute is missing
        """
        # get the attributes that were used to train model for current manufacturer
        try:
            model_smart_attr = self.model_context[manufacturer]
        except KeyError:
            RHDiskFailurePredictor.LOGGER.debug(
                "No context (SMART attributes on which model has been trained) found for manufacturer: {}".format(
                    manufacturer
                )
            )
            return None

        # convert to structured array, keeping only the required features
        # assumes all data is in float64 dtype
        try:
            struc_dtypes = [(attr, np.float64) for attr in model_smart_attr]
            values = [tuple(day[attr] for attr in model_smart_attr) for day in disk_days]
            disk_days_sa = np.array(values, dtype=struc_dtypes)
        except KeyError:
            # a day dict lacked one of the attributes the model needs
            RHDiskFailurePredictor.LOGGER.debug(
                "Mismatch in SMART attributes used to train model and SMART attributes available"
            )
            return None

        # view structured array as 2d array for applying rolling window transforms
        # do not include capacity_bytes in this. only use smart_attrs
        # (the float64 view is safe because every field was declared float64 above)
        disk_days_attrs = disk_days_sa[[attr for attr in model_smart_attr if 'smart_' in attr]]\
            .view(np.float64).reshape(disk_days_sa.shape + (-1,))

        # featurize n (6 to 12) days data - mean,std,coefficient of variation
        # current model is trained on 6 days of data because that is what will be
        # available at runtime

        # rolling time window interval size in days
        roll_window_size = 6

        # rolling means generator
        # NOTE(review): if fewer than 6 days are supplied, dataset_size <= 0 and
        # np.vstack on an empty generator will raise — confirm callers always
        # pass >= 6 days.
        dataset_size = disk_days_attrs.shape[0] - roll_window_size + 1
        gen = (disk_days_attrs[i: i + roll_window_size, ...].mean(axis=0)
               for i in range(dataset_size))
        means = np.vstack(gen)  # type: ignore

        # rolling stds generator (ddof=1: sample standard deviation)
        gen = (disk_days_attrs[i: i + roll_window_size, ...].std(axis=0, ddof=1)
               for i in range(dataset_size))
        stds = np.vstack(gen)  # type: ignore

        # coefficient of variation
        # NOTE(review): isnan only zeroes 0/0 results; a zero mean with nonzero
        # std yields inf which passes through to the scaler — confirm intended.
        cvs = stds / means
        cvs[np.isnan(cvs)] = 0
        featurized = np.hstack((means,
                                stds,
                                cvs,
                                disk_days_sa['user_capacity'][: dataset_size].reshape(-1, 1)))

        # scale features using the scaler pickled alongside the model
        scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.pkl")
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
        featurized = scaler.transform(featurized)
        return featurized

    @staticmethod
    def __get_manufacturer(model_name: str) -> Optional[str]:
        """Returns the manufacturer name for a given hard drive model name

        Arguments:
            model_name {str} -- hard drive model name

        Returns:
            str -- lower-cased manufacturer name, or None if no known prefix
                   matches (lower-case matches the config.json keys)
        """
        for prefix, manufacturer in RHDiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.items():
            if model_name.startswith(prefix):
                return manufacturer.lower()
        # print error message
        RHDiskFailurePredictor.LOGGER.debug(
            f"Could not infer manufacturer from model name {model_name}")
        return None

    def predict(self, disk_days: Sequence[DevSmartT]) -> str:
        """Predict health for one device from its recent daily SMART data.

        Returns one of the PREDICTION_CLASSES strings; "Unknown" when the
        manufacturer cannot be determined or preprocessing fails.
        """
        # get manufacturer preferably as a smartctl attribute
        # if not available then infer using model name
        # NOTE(review): the "vendor" value is used as-is as a config key, while
        # __get_manufacturer returns lower-case — confirm smartctl vendor
        # strings match the config.json key casing.
        manufacturer = disk_days[0].get("vendor")
        if manufacturer is None:
            RHDiskFailurePredictor.LOGGER.debug(
                '"vendor" field not found in smartctl output. Will try to infer manufacturer from model name.'
            )
            manufacturer = RHDiskFailurePredictor.__get_manufacturer(
                disk_days[0].get("model_name", ""))

        # print error message, return Unknown, and continue execution
        # (NOTE(review): "determiend" typo is in the runtime log string; left
        # untouched here)
        if manufacturer is None:
            RHDiskFailurePredictor.LOGGER.debug(
                "Manufacturer could not be determiend. This may be because \
                DiskPredictor has never encountered this manufacturer before, \
                or the model name is not according to the manufacturer's \
                naming conventions known to DiskPredictor"
            )
            return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]

        # preprocess for feeding to model
        preprocessed_data = self.__preprocess(disk_days, manufacturer)
        if preprocessed_data is None:
            return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]

        # get model for current manufacturer
        model_path = os.path.join(
            self.model_dirpath, manufacturer + "_predictor.pkl"
        )
        with open(model_path, 'rb') as f:
            model = pickle.load(f)

        # use prediction for most recent day
        # TODO: ensure that most recent day is last element and most previous day
        # is first element in input disk_days
        pred_class_id = model.predict(preprocessed_data)[-1]
        return RHDiskFailurePredictor.PREDICTION_CLASSES[pred_class_id]
252
253
class PSDiskFailurePredictor(Predictor):
    """Disk failure prediction developed at ProphetStor

    This class implements a disk failure prediction module.
    """

    CONFIG_FILE = "config.json"
    EXCLUDED_ATTRS = ["smart_9_raw", "smart_241_raw", "smart_242_raw"]

    def __init__(self) -> None:
        """
        This function may throw exception due to wrong file operation.
        """
        # Directory holding the pickled models; set by initialize().
        self.model_dirpath = ""
        # config.json contents: model file name -> ordered SMART attribute list.
        self.model_context: Dict[str, List[str]] = {}

    def initialize(self, model_dirpath: str) -> None:
        """Load the model configuration and verify every model file exists.

        Arguments:
            model_dirpath: path to the directory of trained models.

        Raises:
            Exception: if the config file or any model file listed in it
                       is missing.
        """
        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            raise Exception(f"Missing config file: {config_path}")
        with open(config_path) as f_conf:
            self.model_context = json.load(f_conf)

        # every model named in the config must be present on disk
        for model_name in self.model_context:
            model_path = os.path.join(model_dirpath, model_name)
            if not os.path.isfile(model_path):
                raise Exception(f"Missing model file: {model_path}")

        self.model_dirpath = model_dirpath

    def __preprocess(self, disk_days: Sequence[DevSmartT]) -> Sequence[DevSmartT]:
        """Filter disk days down to usable SMART raw attributes.

        Keeps only smart_*_raw attributes present on every day, drops the
        excluded ones, and drops negative values per day.

        Arguments:
            disk_days: refer to predict(...).

        Returns:
            List of per-day dicts containing only the retained attributes.
        """
        # attributes common to every supplied day
        common_attrs = set.intersection(*(set(day.keys()) for day in disk_days))
        wanted_attrs = [
            attr
            for attr in common_attrs
            if attr.startswith("smart_")
            and attr.endswith("_raw")
            and attr not in self.EXCLUDED_ATTRS
        ]

        # per day, keep only non-negative values of the wanted attributes
        return [
            {attr: day[attr] for attr in wanted_attrs if float(day[attr]) >= 0.0}
            for day in disk_days
        ]

    @staticmethod
    def __get_diff_attrs(disk_days: Sequence[DevSmartT]) -> Tuple[AttrNamesT, AttrDiffsT]:
        """Compute day-over-day differences of the common SMART attributes.

        Arguments:
            disk_days: refer to predict(...).

        Returns:
            attr_list: attributes present on every day (intersection).
            diff_disk_days: one dict per consecutive day pair holding the
                            integer difference for each attribute.
        """
        shared = [set(day.keys()) for day in disk_days]
        attr_list = list(set.intersection(*shared))
        # TODO: ensure that this ordering is correct
        diff_disk_days = [
            {attr: int(later[attr]) - int(earlier[attr]) for attr in attr_list}
            for earlier, later in zip(disk_days, disk_days[1:])
        ]
        return attr_list, diff_disk_days

    def __get_best_models(self, attr_list: AttrNamesT) -> Optional[Dict[str, List[str]]]:
        """Select the models that best match the available attributes.

        Arguments:
            attr_list: all SMART attributes available for the disk.

        Returns:
            Mapping of model file path -> that model's ordered attribute
            list, for every model whose match score is within 2 of the best;
            None when even the best model matches fewer than 3 attributes.
        """
        model_names = list(self.model_context)
        # score = how many of the model's training attributes we have
        scores = [
            sum(attr in attr_list for attr in self.model_context[name])
            for name in model_names
        ]
        max_score = max(scores)

        # Skip if too few matched attributes.
        if max_score < 3:
            print("Too few matched attributes")
            return None

        return {
            os.path.join(self.model_dirpath, name): self.model_context[name]
            for name, score in zip(model_names, scores)
            if score > max_score - 2
        }

    @staticmethod
    def __get_ordered_attrs(disk_days: Sequence[DevSmartT], model_attrlist: List[str]) -> List[List[float]]:
        """Project disk days onto a model's ordered attribute list.

        Arguments:
            disk_days: unordered disk days.
            model_attrlist: the model's ordered attribute list.

        Returns:
            One row per day, values in model order; 0 for missing attributes.
        """
        return [
            [day.get(attr, 0) for attr in model_attrlist]
            for day in disk_days
        ]

    def predict(self, disk_days: Sequence[DevSmartT]) -> str:
        """
        Predict using given 6-days disk S.M.A.R.T. attributes.

        Args:
            disk_days: A list struct comprises 6 dictionaries. These
                       dictionaries store 'consecutive' days of disk SMART
                       attributes.
        Returns:
            A string indicates prediction result. One of following four strings
            will be returned according to disk failure status:
            (1) Good : Disk is health
            (2) Warning : Disk has some symptoms but may not fail immediately
            (3) Bad : Disk is in danger and data backup is highly recommended
            (4) Unknown : Not enough data for prediction.

        Raises:
            Pickle exceptions
        """
        proc_disk_days = self.__preprocess(disk_days)
        attr_list, diff_data = PSDiskFailurePredictor.__get_diff_attrs(proc_disk_days)
        best_models = self.__get_best_models(attr_list)
        if best_models is None:
            return "Unknown"

        votes = []
        for model_path, model_attrlist in best_models.items():
            ordered_data = PSDiskFailurePredictor.__get_ordered_attrs(
                diff_data, model_attrlist
            )

            try:
                with open(model_path, "rb") as f_model:
                    clf = pickle.load(f_model)
            except UnicodeDecodeError:
                # Compatibility for python3
                with open(model_path, "rb") as f_model:
                    clf = pickle.load(f_model, encoding="latin1")

            # one vote per model: does it predict failure on any day?
            predictions = clf.predict(ordered_data)
            votes.append(1 if any(predictions) else 0)

        # aggregate: exponential in the number of agreeing models
        score = 2 ** sum(votes) - len(best_models)
        if score > 10:
            return "Bad"
        if score > 4:
            return "Warning"
        return "Good"