]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/diskprediction_local/predictor.py
import 15.2.0 Octopus source
[ceph.git] / ceph / src / pybind / mgr / diskprediction_local / predictor.py
CommitLineData
9f95a23c 1"""Machine learning model for disk failure prediction.
11fdf7f2 2
9f95a23c
TL
The classes defined here provide the disk failure prediction module.
4RHDiskFailurePredictor uses the models developed at the AICoE in the
5Office of the CTO at Red Hat. These models were built using the open
6source Backblaze SMART metrics dataset.
7PSDiskFailurePredictor uses the models developed by ProphetStor as an
8example.
11fdf7f2 9
9f95a23c
TL
10An instance of the predictor is initialized by providing the path to trained
11models. Then, to predict hard drive health and deduce time to failure, the
12predict function is called with 6 days worth of SMART data from the hard drive.
13It will return a string to indicate disk failure status: "Good", "Warning",
14"Bad", or "Unknown".
11fdf7f2
TL
15
16An example code is as follows:
17
9f95a23c 18>>> model = disk_failure_predictor.RHDiskFailurePredictor()
11fdf7f2
TL
19>>> status = model.initialize("./models")
20>>> if status:
21>>> model.predict(disk_days)
22'Bad'
11fdf7f2 23"""
11fdf7f2
TL
24import os
25import json
26import pickle
9f95a23c
TL
27import logging
28
29import numpy as np
30from scipy import stats
11fdf7f2
TL
31
32
def get_diskfailurepredictor_path():
    """Return the absolute path of the directory containing this module."""
    return os.path.dirname(os.path.abspath(__file__))
37
38
9f95a23c
TL
39class RHDiskFailurePredictor(object):
40 """Disk failure prediction module developed at Red Hat
11fdf7f2
TL
41
42 This class implements a disk failure prediction module.
43 """
44
9f95a23c
TL
45 # json with manufacturer names as keys
46 # and features used for prediction as values
11fdf7f2 47 CONFIG_FILE = "config.json"
9f95a23c
TL
48 PREDICTION_CLASSES = {-1: "Unknown", 0: "Good", 1: "Warning", 2: "Bad"}
49
50 # model name prefixes to identify vendor
51 MANUFACTURER_MODELNAME_PREFIXES = {
52 "WDC": "WDC",
53 "Toshiba": "Toshiba", # for cases like "Toshiba xxx"
54 "TOSHIBA": "Toshiba", # for cases like "TOSHIBA xxx"
55 "toshiba": "Toshiba", # for cases like "toshiba xxx"
56 "S": "Seagate", # for cases like "STxxxx" and "Seagate BarraCuda ZAxxx"
57 "ZA": "Seagate", # for cases like "ZAxxxx"
58 "Hitachi": "Hitachi",
59 "HGST": "HGST",
60 }
61
62 LOGGER = logging.getLogger()
63
64 def __init__(self):
65 """
66 This function may throw exception due to wrong file operation.
67 """
68 self.model_dirpath = ""
69 self.model_context = {}
70
71 def initialize(self, model_dirpath):
72 """Initialize all models. Save paths of all trained model files to list
73
74 Arguments:
75 model_dirpath {str} -- path to directory of trained models
76
77 Returns:
78 str -- Error message. If all goes well, return None
79 """
80 # read config file as json, if it exists
81 config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
82 if not os.path.isfile(config_path):
83 return "Missing config file: " + config_path
84 else:
85 with open(config_path) as f_conf:
86 self.model_context = json.load(f_conf)
87
88 # ensure all manufacturers whose context is defined in config file
89 # have models and scalers saved inside model_dirpath
90 for manufacturer in self.model_context:
91 scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.pkl")
92 if not os.path.isfile(scaler_path):
93 return "Missing scaler file: {}".format(scaler_path)
94 model_path = os.path.join(model_dirpath, manufacturer + "_predictor.pkl")
95 if not os.path.isfile(model_path):
96 return "Missing model file: {}".format(model_path)
97
98 self.model_dirpath = model_dirpath
99
100 def __preprocess(self, disk_days, manufacturer):
101 """Scales and transforms input dataframe to feed it to prediction model
102
103 Arguments:
104 disk_days {list} -- list in which each element is a dictionary with key,val
105 as feature name,value respectively.
106 e.g.[{'smart_1_raw': 0, 'user_capacity': 512 ...}, ...]
107 manufacturer {str} -- manufacturer of the hard drive
108
109 Returns:
110 numpy.ndarray -- (n, d) shaped array of n days worth of data and d
111 features, scaled
112 """
113 # get the attributes that were used to train model for current manufacturer
114 try:
115 model_smart_attr = self.model_context[manufacturer]
116 except KeyError as e:
117 RHDiskFailurePredictor.LOGGER.debug(
118 "No context (SMART attributes on which model has been trained) found for manufacturer: {}".format(
119 manufacturer
120 )
121 )
122 return None
123
124 # convert to structured array, keeping only the required features
125 # assumes all data is in float64 dtype
126 try:
127 struc_dtypes = [(attr, np.float64) for attr in model_smart_attr]
128 values = [tuple(day[attr] for attr in model_smart_attr) for day in disk_days]
129 disk_days_sa = np.array(values, dtype=struc_dtypes)
130 except KeyError as e:
131 RHDiskFailurePredictor.LOGGER.debug(
132 "Mismatch in SMART attributes used to train model and SMART attributes available"
133 )
134 return None
135
136 # view structured array as 2d array for applying rolling window transforms
137 # do not include capacity_bytes in this. only use smart_attrs
138 disk_days_attrs = disk_days_sa[[attr for attr in model_smart_attr if 'smart_' in attr]]\
139 .view(np.float64).reshape(disk_days_sa.shape + (-1,))
140
141 # featurize n (6 to 12) days data - mean,std,coefficient of variation
142 # current model is trained on 6 days of data because that is what will be
143 # available at runtime
144
145 # rolling time window interval size in days
146 roll_window_size = 6
147
148 # rolling means generator
149 gen = (disk_days_attrs[i: i + roll_window_size, ...].mean(axis=0) \
150 for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1))
151 means = np.vstack(gen)
152
153 # rolling stds generator
154 gen = (disk_days_attrs[i: i + roll_window_size, ...].std(axis=0, ddof=1) \
155 for i in range(0, disk_days_attrs.shape[0] - roll_window_size + 1))
156 stds = np.vstack(gen)
157
158 # coefficient of variation
159 cvs = stds / means
160 cvs[np.isnan(cvs)] = 0
161 featurized = np.hstack((
162 means,
163 stds,
164 cvs,
165 disk_days_sa['user_capacity'][: disk_days_attrs.shape[0] - roll_window_size + 1].reshape(-1, 1)
166 ))
167
168 # scale features
169 scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.pkl")
170 with open(scaler_path, 'rb') as f:
171 scaler = pickle.load(f)
172 featurized = scaler.transform(featurized)
173 return featurized
174
175 @staticmethod
176 def __get_manufacturer(model_name):
177 """Returns the manufacturer name for a given hard drive model name
178
179 Arguments:
180 model_name {str} -- hard drive model name
181
182 Returns:
183 str -- manufacturer name
184 """
185 for prefix, manufacturer in RHDiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.items():
186 if model_name.startswith(prefix):
187 return manufacturer
188 # print error message
189 RHDiskFailurePredictor.LOGGER.debug(
190 "Could not infer manufacturer from model name {}".format(model_name)
191 )
192
193 def predict(self, disk_days):
194 # get manufacturer preferably as a smartctl attribute
195 # if not available then infer using model name
196 manufacturer = disk_days[0].get("vendor")
197 if manufacturer is None:
198 RHDiskFailurePredictor.LOGGER.debug(
199 '"vendor" field not found in smartctl output. Will try to infer manufacturer from model name.'
200 )
201 manufacturer = RHDiskFailurePredictor.__get_manufacturer(
202 disk_days[0].get("model_name", "")
203 ).lower()
204
205 # print error message, return Unknown, and continue execution
206 if manufacturer is None:
207 RHDiskFailurePredictor.LOGGER.debug(
208 "Manufacturer could not be determiend. This may be because \
209 DiskPredictor has never encountered this manufacturer before, \
210 or the model name is not according to the manufacturer's \
211 naming conventions known to DiskPredictor"
212 )
213 return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]
214
215 # preprocess for feeding to model
216 preprocessed_data = self.__preprocess(disk_days, manufacturer)
217 if preprocessed_data is None:
218 return RHDiskFailurePredictor.PREDICTION_CLASSES[-1]
219
220 # get model for current manufacturer
221 model_path = os.path.join(
222 self.model_dirpath, manufacturer + "_predictor.pkl"
223 )
224 with open(model_path, 'rb') as f:
225 model = pickle.load(f)
226
227 # use prediction for most recent day
228 # TODO: ensure that most recent day is last element and most previous day
229 # is first element in input disk_days
230 pred_class_id = model.predict(preprocessed_data)[-1]
231 return RHDiskFailurePredictor.PREDICTION_CLASSES[pred_class_id]
232
233
class PSDiskFailurePredictor(object):
    """Disk failure prediction developed at ProphetStor

    This class implements a disk failure prediction module. Several candidate
    models are scored against the SMART attributes a disk provides, and the
    best-matching models vote on the disk's health.
    """

    CONFIG_FILE = "config.json"
    # raw attributes excluded from the differential features
    EXCLUDED_ATTRS = ["smart_9_raw", "smart_241_raw", "smart_242_raw"]

    def __init__(self):
        """
        This function may throw exception due to wrong file operation.
        """
        self.model_dirpath = ""
        self.model_context = {}

    def initialize(self, model_dirpath):
        """
        Initialize all models.

        Args:
            model_dirpath: path to directory of trained models.

        Returns:
            Error message. If all goes well, return None.
        """
        config_path = os.path.join(model_dirpath, self.CONFIG_FILE)
        if not os.path.isfile(config_path):
            return "Missing config file: " + config_path
        with open(config_path) as f_conf:
            self.model_context = json.load(f_conf)

        # every model named in the config must exist on disk
        for model_name in self.model_context:
            model_path = os.path.join(model_dirpath, model_name)
            if not os.path.isfile(model_path):
                return "Missing model file: " + model_path

        self.model_dirpath = model_dirpath

    def __preprocess(self, disk_days):
        """
        Preprocess disk attributes.

        Keeps only "smart_*_raw" attributes that are present on every day,
        are not in EXCLUDED_ATTRS, and have non-negative values.

        Args:
            disk_days: Refer to function predict(...).

        Returns:
            new_disk_days: Processed disk days.
        """
        req_attrs = []
        new_disk_days = []

        # only attributes present on every day are usable
        attr_list = set.intersection(*[set(disk_day.keys()) for disk_day in disk_days])
        for attr in attr_list:
            if (
                attr.startswith("smart_") and attr.endswith("_raw")
            ) and attr not in self.EXCLUDED_ATTRS:
                req_attrs.append(attr)

        for disk_day in disk_days:
            new_disk_day = {}
            for attr in req_attrs:
                if float(disk_day[attr]) >= 0.0:
                    new_disk_day[attr] = disk_day[attr]

            new_disk_days.append(new_disk_day)

        return new_disk_days

    @staticmethod
    def __get_diff_attrs(disk_days):
        """
        Get differential attributes between consecutive days.

        Args:
            disk_days: Refer to function predict(...).

        Returns:
            attr_list: S.M.A.R.T. attributes common to all given disk days
                (intersection set of all days).

            diff_disk_days: A list with one element fewer than disk_days;
                each dictionary contains day-over-day attribute deltas.

        Raises:
            Exceptions of wrong list/dict operations.
        """
        all_attrs = [set(disk_day.keys()) for disk_day in disk_days]
        # BUGFIX: the intersection below was previously overwritten with
        # disk_days[0].keys(), which discarded it and raised KeyError in the
        # delta computation whenever a later day lacked a day-0 attribute
        attr_list = list(set.intersection(*all_attrs))
        prev_days = disk_days[:-1]
        curr_days = disk_days[1:]
        diff_disk_days = []
        # TODO: ensure that this ordering is correct
        for prev, cur in zip(prev_days, curr_days):
            diff_disk_days.append(
                {attr: (int(cur[attr]) - int(prev[attr])) for attr in attr_list}
            )

        return attr_list, diff_disk_days

    def __get_best_models(self, attr_list):
        """
        Find the best models from the model list for the given attribute list.

        Args:
            attr_list: All S.M.A.R.T. attributes used in given disk.

        Returns:
            best_models: dict mapping model path to that model's 'Ordered'
                attribute list. Must be aware that SMART attributes is in
                order. None if too few attributes match any model.

        Raises:
        """
        model_names = list(self.model_context.keys())

        # score each model by how many of its attributes the disk provides
        scores = [
            sum(attr in attr_list for attr in self.model_context[model_name])
            for model_name in model_names
        ]
        # no models configured at all
        if not scores:
            return None
        max_score = max(scores)

        # Skip if too few matched attributes.
        if max_score < 3:
            print("Too few matched attributes")
            return None

        # keep every model within 1 point of the best score
        best_models = {}
        best_model_indices = [
            idx for idx, score in enumerate(scores) if score > max_score - 2
        ]
        for model_idx in best_model_indices:
            model_name = model_names[model_idx]
            model_path = os.path.join(self.model_dirpath, model_name)
            best_models[model_path] = self.model_context[model_name]

        return best_models

    @staticmethod
    def __get_ordered_attrs(disk_days, model_attrlist):
        """
        Return ordered attributes of given disk days.

        Args:
            disk_days: Unordered disk days.
            model_attrlist: Model's ordered attribute list.

        Returns:
            ordered_attrs: Ordered disk days (missing attributes become 0).

        Raises: None
        """
        ordered_attrs = []

        for one_day in disk_days:
            one_day_attrs = []

            for attr in model_attrlist:
                if attr in one_day:
                    one_day_attrs.append(one_day[attr])
                else:
                    one_day_attrs.append(0)

            ordered_attrs.append(one_day_attrs)

        return ordered_attrs

    def predict(self, disk_days):
        """
        Predict using given 6-days disk S.M.A.R.T. attributes.

        Args:
            disk_days: A list struct comprises 6 dictionaries. These
                       dictionaries store 'consecutive' days of disk SMART
                       attributes.
        Returns:
            A string indicates prediction result. One of following four strings
            will be returned according to disk failure status:
            (1) Good : Disk is healthy
            (2) Warning : Disk has some symptoms but may not fail immediately
            (3) Bad : Disk is in danger and data backup is highly recommended
            (4) Unknown : Not enough data for prediction.

        Raises:
            Pickle exceptions
        """
        all_pred = []

        proc_disk_days = self.__preprocess(disk_days)
        attr_list, diff_data = PSDiskFailurePredictor.__get_diff_attrs(proc_disk_days)
        modellist = self.__get_best_models(attr_list)
        if modellist is None:
            return "Unknown"

        for modelpath in modellist:
            model_attrlist = modellist[modelpath]
            ordered_data = PSDiskFailurePredictor.__get_ordered_attrs(
                diff_data, model_attrlist
            )

            try:
                with open(modelpath, "rb") as f_model:
                    clf = pickle.load(f_model)

            except UnicodeDecodeError:
                # Compatibility for python3: models were pickled under python2
                with open(modelpath, "rb") as f_model:
                    clf = pickle.load(f_model, encoding="latin1")

            # a model votes "fail" if any day is predicted as failing
            pred = clf.predict(ordered_data)

            all_pred.append(1 if any(pred) else 0)

        # aggregate the votes into a score; thresholds chosen by ProphetStor
        score = 2 ** sum(all_pred) - len(modellist)
        if score > 10:
            return "Bad"
        if score > 4:
            return "Warning"
        return "Good"