# Licensed Materials - Property of IBM
# 5737-M66, 5900-AAA
# (C) Copyright IBM Corp. 2019, 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication, or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.

"""Failure Probability Model.

This model predicts imminent failures for assets using IoT sensor data and past failure history data. The goal of this model is to characterize the probability of a given asset failing within a given future prediction window. The temporal granularity of the prediction window can be tuned as required, for example, 15 days or 30 days.

This model also does failure contribution breakdown to help root cause analysis. Along with predicted failure probability, the path on the failure contribution tree is also generated to indicate the major factors leading to the failure event.
"""

import json
import logging
import re
from cmath import log

import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import (Day, Hour, Minute, MonthBegin, MonthEnd,
                                    Second, Week)
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              RandomForestClassifier)
from sklearn.feature_selection import (SelectFpr, SelectFromModel, SelectKBest,
                                       SelectPercentile, chi2,
                                       mutual_info_classif)
from sklearn.metrics import f1_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from srom.failure_prediction.preprocessing import merge_feature_and_failure
from srom.pipeline.srom_pipeline import SROMPipeline
from srom.utils.no_op import NoOp

from .estimator import SromEstimator
from .pipeline import AssetGroupPipeline
from .transformer import SimpleSummaryStatistics, _BaseTransformer
from .util import log_df_info


class FailurePredictionAssetGroupPipeline(AssetGroupPipeline):
    """Failure probability model pipeline.

    This `pmlib.pipeline.AssetGroupPipeline` sub-class implements the failure prediction model.

    Trained failure prediction model generates a failure probability and a root cause analysis tree 
    path for each incoming event. You can choose the names of these outputs by key `predictions` of 
    the dict parameter `model_pipeline` (`predictions` here is always a length-two array, of which the 
    first element is the failure probability output name, and the second element is the root cause 
    analysis tree path output name, and the order matters).

    As for the inputs to the model, you can use `features` of the dict parameter `model_pipeline` to 
    specify the attributes from assets and/or IoT devices. As for the `features_for_training` of 
    the dict parameter `model_pipeline`, it is required to be a length-one array containing a 
    datetime attribute representing the asset failure history.

    Here's a typical example of how to create an object of this class for Binary Classification:

    ```
    FailurePredictionAssetGroupPipeline(
        asset_group_id='ID of an asset group',
        model_pipeline={
            'features': ['DeviceTypeOne:temperature', 'DeviceTypeTwo:Humidity'],
            'predictions': ['failure_probability', 'rca_path'],
            'prediction_window_size': '5d',
            'aggregation_methods': ['mean', 'max', 'min', 'median', 'std', 'sum', 'count'],
            'srom_training_options': {
                'exectype': 'single_node_complete_search'
            }
        })
    ```
    
    Here's a typical example of how to create an object of this class for Multiclass Classification:
    ```
    group = FailurePredictionAssetGroupPipeline(
            asset_group_id='ID of an asset group', 
            model_pipeline={
                "features":  ['DeviceTypeOne:temperature', 'DeviceTypeTwo:Humidity'],
                
                "predictions": ["failure_probability", "rca_path"],
                #"aggregation_methods": ["mean", "max", "min", "median", "std", "sum", "count"],
                "aggregation_methods": ["mean", "max", "min"],
                "prediction_window_size": "2d",
                'failure_modes': ['PUMPS/STOPPED','PUMPS/LOWPRES'],
                'multiclass': True,
            })
    ```
    
    

    This model has the following special `model_pipeline` configurations for further tuning:

    * `prediction_window_size` : `str`

        This model computes the probability of asset failure within a given future prediction
        window. The prediction window can only be a multiple of hour or day currently and is
        specified in the format like 6h and 10d, representing 6 hours and 10 days, respectively.
        Note that hour ('h'), day ('d') are case insensitive.

    * `aggregation_methods` : `dict` or `list` of `str`, {'mean', 'max', 'min', 'median', 'std', 'sum', 'count'}, optional

        The actual features to this model are generated from the input data by finding rolling 
        window summary statistics over a fixed window size for the entire dataset. Since we want to 
        use the previous N days data to predict whether a failure event is going to take place 
        in the future, we use a "look-back" window of N days. We can use different aggregation 
        methods like mean, maximum, minimum, median, standard deviation, sum, count for computing 
        the rolling window summary values. Default is ['mean', 'max', 'min', 'median', 'std', 
        'sum', 'count']. If an empty list is given, then the summary statistics feature is disabled. 
        If it is given as a `dict`, use feature names as keys and the list of aggregation methods for
        individual features as values. This way, it is possible to generate different summary
        statistics for different features.

    * `rolling_window_size` : `str`, optional

        The size of the rolling window for generating summary statistics features. It can only be a
        multiple of second, minute, hour, or day, specified in the format like 5s, 10t, 6h and 10d,
        representing 5 seconds, 10 minutes, 6 hours and 10 days, respectively. Default is the same as
        `prediction_window_size`.

    * `multiclass` : `bool`, optional

        Default is False. It must be True when you want to use the multiclass classification model.

    * `failure_modes` : `list` of `str`, optional

        The failure modes. Default is empty list.
        

           The valid value of failure_modes is a list of CLASS/PROBLEMCODE, or a list of CLASS/PROBLEMCODE/CAUSE, or a list of CLASS/PROBLEMCODE/CAUSE/REMEDY.

           Each element in the list must be in the above format.
           Each element in the list must be from the same level.
           Valid examples: 'failure_modes': ['PUMPS/STOPPED','PUMPS/BROKEN']
                          'failure_modes': ['PUMPS/STOPPED/MOTRFAIL','PUMPS/BROKEN/PUMPJAM']
                           'failure_modes': ['PUMPS/STOPPED/MOTRFAIL/REPLACE','PUMPS/BROKEN/PUMPJAM/RESET']

          Invalid examples: 'failure_modes': ['PUMPS/STOPPED','PUMPS/BROKEN/PUMPJAM']   because the first element is a PROBLEMCODE and the second element is a CAUSE.
                            'failure_modes': ['PUMPS/STOPPED','PUMPS/BROKEN/PUMPJAM/RESET']   because the first element is a PROBLEMCODE and the second element is a REMEDY.



    * `data_quality_advisor` : `bool`, optional

        Default is True

    This model pipeline also adds a post-processing phase to generate one daily maximum failure
    probability per asset and another daily mean failure probability of the whole asset group.
    The names of these daily summaries are simply the specified failure probability output name
    prefixed with 'daily_' and 'group_daily_', respectively.

    See base class for details on all the parameters available.
    """

    def __init__(self, **kwargs):
        """Create the pipeline, forcing the model template metadata for this exact class.

        Only a direct instantiation of `FailurePredictionAssetGroupPipeline` overrides
        `model_template_name` and `model_template_desc`; sub-classes keep whatever is
        passed in `kwargs`.

        See base class for details on all the parameters available.
        """
        instantiated_directly = self.__class__ == FailurePredictionAssetGroupPipeline
        if instantiated_directly:
            kwargs.update(
                model_template_name='Failure Probability',
                model_template_desc=None,
            )
        super().__init__(**kwargs)

    def default_summary(self, model_config):
        """This class generates by default one daily maximum failure probability and another 
        group daily average failure probability.

        See `pmlib.pipeline.AssetGroupPipeline.default_summary`.
        """

        unit_name = None
        unit = type(to_offset(model_config['prediction_window_size']).base)
        if unit == Hour:
            unit_name = 'hourly'
        elif unit == Day:
            unit_name = 'daily'
        elif unit == Minute:
            unit_name = 'minute'
        else:
            raise ValueError('invalid model_pipeline.prediction_window_size=%s, unsupported offset alias, can only be one of H and D (case insensitive)' % model_config['prediction_window_size'])

        self.logger.debug('default_summary unit_name=%s', unit_name)

        summary = {}
        for idx, value in enumerate(model_config['predictions'][:-1]): # exclude the last output for RCA
            summary['${predictions[%s]}' % str(idx)] = {
                unit_name: {
                    'max': ('${granularity}_${data_item}', '${data_item}'),
                },
                'group_%s' % unit_name: {
                    'mean': '${granularity}_${data_item}',
                },
            }
                
        return summary

    def prepare_model_config(self, model_config):
        """This class overrides this method to set the default value to the following two custom model pipeline 
        configuration when not given in the constructor's parameter `model_pipeline`.

        * `rolling_window_size`: by default, if not given, set to be the same as `prediction_window_size`
        * `aggregation_methods`: by default, if not given, set to be `['mean', 'max', 'min', 'median', 'std', 'sum', 'count']`

        It also validates whether the `prediction_window_size` is given and is given correctly and if not 
        raises ValueError.

        See `pmlib.pipeline.AssetGroupPipeline.prepare_model_config`.
        """

        # features validation
        if model_config.get('features', None) is None:
            raise ValueError('model_pipeline.features must be given')
        if not isinstance(model_config['features'], list) or len(model_config['features']) == 0:
            raise ValueError('model_pipeline.features must be an non-empty array')

        # features_for_training validation
        # if model_config.get('features_for_training', None) is None:
        #     raise ValueError('model_pipeline.features_for_training must be given')
        # if not isinstance(model_config['features_for_training'], list) or len(model_config['features_for_training']) == 0:
        #     raise ValueError('model_pipeline.features_for_training must be an non-empty array')

        # prediction list validation
        if model_config.get('predictions', None) is None:
            raise ValueError('model_pipeline.predictions must be given')
        if not isinstance(model_config['predictions'], list) or len(model_config['predictions']) != 2:
            raise ValueError('model_pipeline.predictions must be a 2-element array of which the first being failure probability output base name and the second the root-cause-analysis path name')

        # prediction window size config validation
        if model_config.get('prediction_window_size', None) is None:
            raise ValueError('model_pipeline.prediction_window_size must be given')

        try:
            base = to_offset(model_config['prediction_window_size'])
        except ValueError as e:
            raise ValueError('invalid model_pipeline.prediction_window_size=%s: %s' % (model_config['prediction_window_size'], e))
        if type(base) not in [Minute,Hour, Day]:
            raise ValueError('invalid model_pipeline.prediction_window_size=%s, unsupported offset alias, can only be one of H and D (case insensitive)' % model_config['prediction_window_size'])

        # special case of single unit which should always has '1' for the output name usage purpose 
        if base.n == 1 and model_config['prediction_window_size'][0] != '1':
            base = '1%s' % model_config['prediction_window_size']
        else:
            base = model_config['prediction_window_size']

        # generate different one prediction failure probability output per failure mode, based on the same 
        # given output base name
        if model_config.get('failure_modes', None) is not None and len(model_config['failure_modes']) > 0:
            # when falure_modes is given, features_for_training must include 'problemcode' as the first element
            # if model_config['features_for_training'][0] == 'faildate':
            #     raise ValueError('invalid model_pipeline.features_for_training=%s, it must not has ":faildate" as the first element when model_pipeline.failure_modes=%s is used' % (model_config['features_for_training'], model_config['failure_modes']))

            failure_probability = model_config['predictions'][0]

            orig_postfix = ''
            m = re.compile(r'_%s$' % base, re.I).search(failure_probability)
            if m is not None:
                orig_postfix = m.group(0)
                failure_probability = failure_probability[0:m.span()[0]]

            # if failure_modes is a list, encode/transform it to a dict
            if isinstance(model_config['failure_modes'], list):
                # dict of {failure mode label: [prediction list index, bool whether it persents in the trained model outputs]}
                model_config['failure_modes'] = {label: [idx+1, True] for idx, label in enumerate(model_config['failure_modes'])}
                # set to ppeline level so it gets registered as part of the model instance metadata
                self.model_pipeline['failure_modes'] = model_config['failure_modes']

            if not all([isinstance(label, str) and len(label.strip()) > 0 and isinstance(id_num_n_present, list) and len(id_num_n_present) == 2 for label, id_num_n_present in model_config['failure_modes'].items()]):
                raise ValueError('invalid model_pipeline.failure_modes=%s, it must be a list of string or dict of string key and [int, bool] value' % model_config['failure_modes'])

            #TO DO: need to add back  1-13-2020
            """ p = re.compile(r'^[\w-]+$', re.I)
            if not all([p.match(label) is not None for label, _ in model_config['failure_modes'].items()]):
                raise ValueError('invalid model_pipeline.failure_modes=%s, which can only use labels consisting of [a-zA-Z0-9_\\-]+' % model_config['failure_modes'])
 """
            if model_config.get('multiclass', False) == True:
                reversed_mappings = {id_num_n_present[0]: label for label, id_num_n_present in model_config['failure_modes'].items()}
                failure_mode_list = [reversed_mappings[idx+1] for idx in range(len(model_config['failure_modes']))]
                # convert 'PUMPS/STOPPED LOOSE/JAM' to 'PUMPS_STOPPED__LOOSE_JAM'
                failure_mode_list2=[ x.replace('/','_').replace(' ','__') for x in failure_mode_list]
                model_config['predictions'][0:1] = ['%s_%s%s' % (failure_probability, x.lower(), orig_postfix) for x in failure_mode_list2]


        else:
            if model_config.get('multiclass', False) == True:
                # TODO can we allow empty failure_modes to represent using all available values from failure history?
                raise ValueError('invalid model_pipeline.failure_modes=%s, it must be given when model_pipeline.multiclass=%s' % (model_config.get('failure_modes', None), model_config['multiclass']))

            model_config['failure_modes'] = dict()

        # append prediction window size to the prediction outputs as postfix, if not already so
        for i, prediction in enumerate(model_config['predictions']):
            if re.compile(r'_%s$' % base, re.I).search(prediction) is None:
                model_config['predictions'][i] = '%s_%s' % (model_config['predictions'][i], base.lower())

        # rollowing window size config valiation
        if model_config.get('rolling_window_size', None) is None:
            model_config['rolling_window_size'] = model_config['prediction_window_size']
        try:
            base = to_offset(model_config['rolling_window_size'])
            if type(base) not in [Second, Minute, Hour, Day]:
                raise ValueError('invalid model_pipeline.rolling_window_size=%s, unsupported offset alias, can only be one of S, T, H and D (case insensitive)' % model_config['rolling_window_size'])
        except ValueError as e:
            raise ValueError('invalid model_pipeline.rolling_window_size=%s: %s' % (model_config['rolling_window_size'], e))

        # aggregation methods config validation
        if model_config.get('aggregation_methods', None) is None:
            model_config['aggregation_methods'] = ['mean', 'max', 'min', 'median', 'std', 'sum', 'count']
        if isinstance(model_config['aggregation_methods'], dict):
            for ft, aggrs in model_config['aggregation_methods'].items():
                invalid_aggs = set(aggrs) - {'mean', 'max', 'min', 'median', 'std', 'sum', 'count'}
                if len(invalid_aggs) > 0:
                    raise ValueError('invalid model_pipeline.aggregation_methods=%s, of which feature=%s has unknown_methods=%s' % (model_config['aggregation_methods'], ft, invalid_aggs))
        else:
            invalid_aggs = set(model_config['aggregation_methods']) - {'mean', 'max', 'min', 'median', 'std', 'sum', 'count'}
            if len(invalid_aggs) > 0:
                raise ValueError('invalid model_pipeline.aggregation_methods=%s, unknown_methods=%s' % (model_config['aggregation_methods'], invalid_aggs))
   
        # if provide features_for_training, use it
        if model_config.get('features_for_training', None) is not None and len(model_config['features_for_training']) > 0:
            # when falure_modes is given, features_for_training must include 'problemcode' as the first element
            if model_config.get('failure_modes', None) is not None and len(model_config['failure_modes']) > 0 and model_config['features_for_training'][0] == 'faildate':
                raise ValueError('invalid model_pipeline.features_for_training=%s, it must not has ":faildate" as the first element when model_pipeline.failure_modes=%s is used' % (model_config['features_for_training'], model_config['failure_modes']))
        else:
            # if not provide features_for_training, infer it from model_config
            infered_features_for_training = []
            if model_config.get('multiclass', None) is None:
                infered_features_for_training.append(':faildate')
                #pass
            else:
                if model_config['multiclass'] == False:
                    # TODO how to deal with multiclass = False ?
                    infered_features_for_training.append(':faildate')
                    #pass
                else:
                    if model_config.get('failure_modes', None) is not None and len(model_config['failure_modes']) > 0:
                        failure_modes = model_config['failure_modes']
                        if isinstance(failure_modes, str):
                            failure_modes = [failure_modes]
                        for failure_mode in failure_modes:
                            # one_failure_mode = 'PUMPS/STOPPED'
                            number_of_slash = failure_mode.count('/')
                            if number_of_slash == 0:
                                # failure_mode='PUMPS'
                                infered_features_for_training.extend([':classcode'])
                            elif number_of_slash == 1:
                                # failure_mode='PUMPS/LEAKING'
                                infered_features_for_training.extend([':problemcode'])
                            elif number_of_slash == 2:
                                # failure_mode='PUMPS/LEAKING/SEALBROKEN'
                                infered_features_for_training.extend([':causecode'])
                            elif number_of_slash == 3:
                                # failure_mode='PUMPS/LEAKING/SEALBROKEN/REPLACE'
                                infered_features_for_training.extend([':remedycode'])
                    else:
                        raise ValueError('model_pipeline.failure_modes must be an non-empty array')

            # remove duplicate and keep order
            # infered_features_for_training = list(set(infered_features_for_training))
            infered_features_for_training = sorted(set(infered_features_for_training), key=lambda x: infered_features_for_training.index(x))

            # modify features_for_training
            model_config['features_for_training'] = [feature.replace(':', '') for feature in infered_features_for_training]
            
            # modify targets
            model_config['targets'] = [feature.replace(':', '') for feature in infered_features_for_training]
            
            # modify inputs
            # if model_config.get('inputs', None) is not None:
            inputs_list = list(model_config['inputs'])
            inputs_list.extend(infered_features_for_training)
            del model_config['inputs']
            model_config['inputs'] = tuple(inputs_list)
            
            # modify rename_inputs
            # if model_config.get('renamed_inputs', None) is not None:
            rename_input_list = list(model_config['renamed_inputs'])
            rename_input_list.extend([feature.replace(':', '') for feature in infered_features_for_training])
            del model_config['renamed_inputs']
            model_config['renamed_inputs'] = tuple(rename_input_list)

    def prepare_execute(self, pipeline, model_config):
        """Assemble the pipeline stages: summary statistics, failure probability estimator and RCA estimator.

        Unless `aggregation_methods` is empty, `pmlib.transformer.SimpleSummaryStatistics` is
        added first (one stage per feature when `aggregation_methods` is a dict) and the names
        of the generated statistics are appended to `model_config['features']`. This is for
        both training and scoring.

        Then two estimators are added as separate sequential stages: the failure probability
        estimator, followed by the root cause analysis estimator. The multiclass variants are
        used when `multiclass` is True. Both get one training-only preprocessor:

        * `pmlib.failure_prediction.FailurePredictionEstimatorFeatureExtraction`

        The failure probability estimator additionally gets a `DataQualityAdvisor` training
        preprocessor, unless disabled by `data_quality_advisor` or `use_labeled_data`.

        `model_config['predictions']` is temporarily narrowed down for each estimator and
        restored before returning.

        See `pmlib.pipeline.AssetGroupPipeline.prepare_execute`.
        """

        super().prepare_execute(pipeline, model_config)

        aggregations = model_config['aggregation_methods']
        if len(aggregations) > 0:
            window = model_config['rolling_window_size']
            if not isinstance(aggregations, dict):
                # the transformer gets its own copy of the feature list, because the list held by
                # model_config is extended right below with the names of the generated statistics
                pipeline.add_stage(SimpleSummaryStatistics(features=model_config['features'].copy(), aggregation_methods=aggregations, rolling_window_size=window))

                # the new names are fully computed from the not-yet-extended list before extending it
                generated = [(name + '__' + method + '__' + str(window)) for method in aggregations for name in model_config['features']]
                model_config['features'].extend(generated)
            else:
                for name, methods in aggregations.items():
                    pipeline.add_stage(SimpleSummaryStatistics(features=[name], aggregation_methods=methods, rolling_window_size=window))
                    model_config['features'].extend([(name + '__' + method + '__' + str(window)) for method in methods])

        # each of the 2 estimators uses its own part of the prediction output names, so keep the
        # complete list aside to restore it at the end
        all_predictions = model_config['predictions']

        # NOTE the 'smartclassification' flag of model_config is not consulted here, the
        # FailurePredictionEstimatorSmartClassification based path is currently disabled
        is_multiclass = model_config.get('multiclass', False) == True

        if model_config.get('failure_modes', None) is None:
            model_config['failure_modes'] = {}

        def new_label_extractor():
            # training-only preprocessor extracting the asset failure labels
            return FailurePredictionEstimatorFeatureExtraction(
                feature=model_config['features_for_training'][0],
                prediction_window_size=model_config['prediction_window_size'],
                multiclass=is_multiclass,
                failure_modes=model_config['failure_modes'])

        # 1st estimator: failure probability, using all but the last output name for multiclass,
        # otherwise only the first one
        if is_multiclass:
            model_config['predictions'] = all_predictions[0:-1]
            probability_estimator = MulticlassFailurePredictionEstimator(**model_config)
        else:
            model_config['predictions'] = all_predictions[0:1]
            probability_estimator = FailurePredictionEstimator(**model_config)

        # NOTE must add estimator to the pipeline first before adding its training preprocessors
        pipeline.add_stage(probability_estimator)

        advisor_enabled = model_config.get('data_quality_advisor', True)

        # these defaults are set after the 1st estimator has been created, so through
        # **model_config only the 2nd estimator receives them when not given by the caller
        model_config.setdefault('data_quality_sensor_threshold', 0)
        model_config.setdefault('data_quality_failure_history_threshold', 0)

        probability_estimator.add_training_preprocessor(new_label_extractor())

        # need failure_id column
        if model_config.get('use_labeled_data', True) and len(model_config['features_for_training']) > 0:
            if not advisor_enabled:
                self.logger.debug("skip data quality advisor")
            else:
                probability_estimator.add_training_preprocessor(DataQualityAdvisor(
                    model_config['features_for_training'][0],
                    minimum_acceptable_data_quality_score=model_config.get('minimum_acceptable_data_quality_score', 0)
                ))

        # 2nd estimator: root cause analysis, using the last output name for multiclass, otherwise
        # the second one
        if is_multiclass:
            model_config['predictions'] = all_predictions[-1:]
            rca_estimator = MulticlassFailurePredictionRcaEstimator(**model_config)
        else:
            model_config['predictions'] = all_predictions[1:2]
            rca_estimator = FailurePredictionRcaEstimator(**model_config)

        # NOTE must add estimator to the pipeline first before adding its training preprocessors
        pipeline.add_stage(rca_estimator)
        rca_estimator.add_training_preprocessor(new_label_extractor())

        # restore the original prediction output list
        model_config['predictions'] = all_predictions


    def get_prediction_backtrack(self, model_config, **kwargs):
        """Extend the backtrack window of the base class by one rolling window size plus 30 days.

        The extra rolling window is needed because the initial 'rolling-window' of data would
        be cut off. The extra 30 days are added for simplicity, to essentially grab 30 days
        of data.

        See `pmlib.pipeline.AssetGroupPipeline.get_prediction_backtrack`.
        """

        backtrack = super().get_prediction_backtrack(model_config, **kwargs)
        start, end = backtrack

        # both offsets are appended to the start side only
        for alias in (model_config['rolling_window_size'], '30d'):
            start.append(to_offset(alias))

        return [start, end]


class FailurePredictionEstimator(SromEstimator):
    """Failure probability estimator.

    This sub-class estimator mainly overrides method `get_stages()` to construct the model pipeline stages.

    This class is a simple wrapper of the underlying SROM library.
    """

    def __init__(self, features, targets, predictions, **kwargs):
        """Create the estimator and remember the extra keyword arguments.

        All the parameters are passed on to the base class. `kwargs` is also kept as
        `self.kwargs`, from which `train_model()` reads the SmartClassification options.

        * `pre_trained_model` : optional, in `kwargs`

            When given, it is stored as `self.pre_trained_model` and `train_model()` returns
            it instead of training a new model.
        """
        super().__init__(features=features, targets=targets, predictions=predictions, **kwargs)
        self.kwargs = kwargs

        if 'pre_trained_model' in kwargs:
            self.pre_trained_model = kwargs['pre_trained_model']
        elif not hasattr(self, 'pre_trained_model'):
            # train_model() reads this attribute unconditionally, so make sure it always exists;
            # a value possibly already set by the base class is left untouched
            self.pre_trained_model = None

    def configure_pipeline(self, srom_pipeline):
        srom_pipeline.add_input_meta_data(label_column=self.features_for_training[0])
        srom_pipeline.set_scoring('accuracy')
        return srom_pipeline

    def get_stages(self, df):
        """Return the three groups of candidate pipeline stages, as a list of lists.

        The groups are, in order: scalers (including a no-op), feature reduction/selection,
        and classifiers. Presumably SROM picks one candidate per group when searching for the
        best pipeline -- confirm against the SROM documentation. `df` is not used.
        """
        scalers = [NoOp(), StandardScaler(), RobustScaler(), MinMaxScaler()]
        feature_reducers = [PCA(), SelectKBest(k=2)]
        classifiers = [DecisionTreeClassifier(), AdaBoostClassifier(), RandomForestClassifier()]
        return [scalers, feature_reducers, classifiers]

    def get_prediction_result_value_index(self):
        return (1, 1)

    def train_model(self, df):
        """Train a SROM `SmartClassification` model on `df`, or reuse the pre-trained model.

        When `self.pre_trained_model` is set, it is returned as is and no training takes
        place. Otherwise the first element of `self.features_for_training` names the label
        column and every other column of `df` is used as a model input, in the column order
        of `df` so that the input order is the same on every run.

        The SmartClassification options are read from `self.kwargs`, with these defaults:
        `mode`='auto', `scoring`='accuracy', `execution_time_per_pipeline`=3,
        `execution_platform`='spark_node_random_search', `total_execution_time`=20,
        `num_options_per_pipeline_for_random_search`=10,
        `num_option_per_pipeline_for_intelligent_search`=30, `n_leaders_for_ensemble`=5,
        `class_imbalance_threshold`=0.2 and `predict_proba_adjust`=True. `level`,
        `save_prefix` and `cv` are passed on only when given. `stages` is set to
        `self.override_training_stages`, which may be None.

        * `df` : `pandas.DataFrame`

            The training data, containing the label column.

        Returns the fitted `SmartClassification` object, or the pre-trained model.
        """
        from srom.classification.smart_classification import \
            SmartClassification

        # fall back to None so that an estimator lacking the attribute can still be trained
        pre_trained_model = getattr(self, 'pre_trained_model', None)

        self.logger.info('Beginning model training using SmartClassification... self.pre_trained_model=%s', pre_trained_model)
        self.logger.debug('Model training input: df=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('Training DF columns: %s', list(df.columns))
        self.logger.debug('Training override_training_stages=%s', self.override_training_stages)

        if pre_trained_model is not None:
            # nothing to fit, so do not even build a SmartClassification object
            self.logger.debug('skip training')
            return pre_trained_model

        label_column = self.features_for_training[0]
        # keep the column order of df (a set difference would give an arbitrary order) while still
        # dropping the label column and any duplicated column name
        feature_columns = list(dict.fromkeys(column for column in df.columns if column != label_column))
        self.logger.debug('Training DF X_col: %s', feature_columns)

        # options always passed to SmartClassification, with their defaults
        # (execution_time_per_pipeline and total_execution_time: 3 and 20, in minutes per the
        # original comments)
        option_defaults = {
            'mode': 'auto',
            'scoring': 'accuracy',
            'execution_time_per_pipeline': 3,
            'execution_platform': 'spark_node_random_search',
            'total_execution_time': 20,
            'num_options_per_pipeline_for_random_search': 10,
            'num_option_per_pipeline_for_intelligent_search': 30,
            'n_leaders_for_ensemble': 5,
            'class_imbalance_threshold': 0.2,
            'predict_proba_adjust': True,
        }
        smart_classification_option = {name: self.kwargs.get(name, default) for name, default in option_defaults.items()}

        # options passed only when explicitly given
        for name in ('level', 'save_prefix', 'cv'):
            value = self.kwargs.get(name)
            if value is not None:
                smart_classification_option[name] = value

        smart_classification_option['stages'] = self.override_training_stages
        if self.override_training_stages is not None:
            self.logger.debug('Applying specifed overriden training stages: %s', self.override_training_stages)
            self.logger.debug('SmartClassification options: %s', smart_classification_option)

        ac = SmartClassification(**smart_classification_option)
        ac.fit(df[feature_columns].values, df[[label_column]].values.ravel())

        return ac

class FailurePredictionEstimatorSmartClassification(SromEstimator):
    """Failure probability estimator with Smart Classification.

    This class is a simple wrapper of the underlying SROM library.
    """

    def get_prediction_result_value_index(self):
        """Return the index tuple ``(1, 1)`` locating the value of interest in one prediction item.

        `predict()` yields ``(predicted_label, class_probabilities)`` pairs, so
        ``(1, 1)`` presumably addresses the second entry of the probability
        vector (how the tuple is consumed is up to the base class -- TODO confirm).
        """
        return (1, 1)

    def train_model(self, df):
        """Fit a SROM `SmartClassification` model on the training DataFrame.

        The label column is the first entry of `self.features_for_training`;
        every other column of `df` is used as a model input. Tuning options are
        taken from `self.kwargs`:

        * ``scoring`` -- defaults to ``'accuracy'``.
        * ``execution_time_per_pipeline`` -- forwarded only when given,
          otherwise the `SmartClassification` default applies.
        * ``execution_platform`` -- defaults to ``None``.
        * ``total_execution_time`` -- defaults to ``1``.
        * ``level``, ``save_prefix``, ``cv``,
          ``num_options_per_pipeline_for_random_search``,
          ``num_option_per_pipeline_for_intelligent_search``, ``mode``,
          ``n_leaders_for_ensemble``, ``class_imbalance_threshold`` --
          forwarded only when given.

        `self.override_training_stages` (possibly ``None``) is passed as the
        ``stages`` option.

        Parameters
        ----------
        df : `pandas.DataFrame`
            Training data holding the label column and the feature columns.

        Returns
        -------
        The fitted `SmartClassification` instance.
        """
        from srom.classification.smart_classification import \
            SmartClassification
        self.logger.info('Beginning model training using SmartClassification...')
        self.logger.debug('Model training input: df=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        label_column = self.features_for_training[0]
        all_columns = list(df.columns)
        self.logger.debug('Training DF columns: %s', all_columns)
        self.logger.debug('Training override_training_stages=%s', self.override_training_stages)

        # The label must not be part of the feature matrix: keeping it would
        # leak the target into the model and make the trained model expect one
        # more input column than predict() supplies.
        X_col = [column for column in all_columns if column != label_column]
        y_col = [label_column]

        smart_classification_option = dict()

        scoring = self.kwargs.get('scoring')
        smart_classification_option['scoring'] = 'accuracy' if scoring is None else scoring

        # no local default: when not given, SmartClassification's own default is used
        execution_time_per_pipeline = self.kwargs.get('execution_time_per_pipeline')
        if execution_time_per_pipeline is not None:
            smart_classification_option['execution_time_per_pipeline'] = execution_time_per_pipeline

        smart_classification_option['execution_platform'] = self.kwargs.get('execution_platform')

        # total_execution_time falls back to 1 when not given
        total_execution_time = self.kwargs.get('total_execution_time')
        smart_classification_option['total_execution_time'] = 1 if total_execution_time is None else total_execution_time

        other_parameters = ['level', 'save_prefix', 'cv', 'num_options_per_pipeline_for_random_search',
                            'num_option_per_pipeline_for_intelligent_search', 'mode', 'n_leaders_for_ensemble',
                            'class_imbalance_threshold']
        for parameter in other_parameters:
            value = self.kwargs.get(parameter)
            if value is not None:
                smart_classification_option[parameter] = value

        # None means "no override"; it is passed through unchanged
        smart_classification_option['stages'] = self.override_training_stages
        ac = SmartClassification(**smart_classification_option)

        smart_clf_train_x = df[X_col].values
        smart_clf_train_y = df[y_col].values.ravel()
        ac.fit(smart_clf_train_x, smart_clf_train_y)

        return ac

    def predict(self, model, df):
        """Return ``(predicted_label, class_probabilities)`` pairs for the rows of `df`.

        Parameters
        ----------
        model : fitted model or `None`
            Object exposing ``predict`` and ``predict_proba``.
        df : `pandas.DataFrame`
            Feature values to score; passed to the model as ``df.values``.

        Returns
        -------
        `list` of `tuple`, or `None` when `model` is `None`.
        """
        self.logger.debug('Running prediction for model = %s', model)
        self.logger.debug('in predict , df=: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        if model is None:
            return None
        feature_values = df.values
        return list(zip(model.predict(feature_values), model.predict_proba(feature_values)))


class FailurePredictionEstimatorFeatureExtraction(_BaseTransformer):
    """This transformer transforms and expands a failure event feature based on the given prediction window size.

    In order to train a failure probability model, failure events need to be expanded to include also those
    data points prior to the failure events (and the number of data points depends on the desired prediction
    window size). For example, if we want to predict the failure probability in the next 10 days, when loading
    data to train the model, we need to "look-back" 10 days before each failure event and label them as failures
    as well before feeding to the model as training labels.
    """

    def __init__(self, feature, prediction_window_size, multiclass=False, failure_modes=None):
        """
        Parameters
        ----------
        feature : `str`
            The failure event feature to be transformed.
        prediction_window_size : `str`
            The size of the prediction window, specified in
            [Pandas offset alias](http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).
            Only minute, hour and day based offsets are accepted.
        multiclass : `bool`, optional
            Whether to use multi-class model. Default is False.
        failure_modes : `dict`, optional
            Failure modes to consider, keyed by failure mode name (`str`).
            `execute()` reads element 0 of each value as the numeric class id
            and writes element 1 as a "present in the data" flag, so each value
            is expected to be a mutable two-item sequence. Default is an empty
            dict, meaning no filtering by failure mode.

        Raises
        ------
        ValueError
            If any parameter has an invalid type or value.
        """

        super().__init__()

        if feature is None or not isinstance(feature, str) or len(feature) == 0:
            raise ValueError('parameter feature must be an non-empty string')
        self.feature = feature

        if prediction_window_size is None or not isinstance(prediction_window_size, str) or len(prediction_window_size) == 0:
            raise ValueError('parameter prediction_window_size must be an non-empty string')
        # Only the parsing is guarded; the type check is done outside the try
        # block so that its own ValueError is not caught and wrapped again.
        try:
            window_offset = to_offset(prediction_window_size)
        except ValueError as e:
            raise ValueError('invalid prediction_window_size=%s: %s' % (prediction_window_size, e)) from e
        if type(window_offset) not in {Minute, Hour, Day}:
            raise ValueError('invalid prediction_window_size=%s: unsupported offset alias, can only use minute, hour (H) or day (D) based offsets' % (prediction_window_size))
        self.prediction_window_size = prediction_window_size

        if multiclass is not None and not isinstance(multiclass, bool):
            raise ValueError('parameter multiclass=%s must be a boolean value' % multiclass)
        # None is treated as False
        self.multiclass = multiclass is True

        if failure_modes is not None and (not isinstance(failure_modes, dict) or any(not isinstance(fm, str) for fm in failure_modes)):
            raise ValueError('parameter failure_modes=%s must be a dict of string keys' % failure_modes)
        self.failure_modes = {} if failure_modes is None else failure_modes


    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Turn the raw failure event column into a per-row training label.

        Steps performed:

        1. Reset the index and, when `self.failure_modes` is non-empty, blank out
           (NaN) every failure value that is not one of its keys. Failure modes
           that do not occur in the data get their "present" flag (element 1 of
           the dict value) set to False -- this mutates `self.failure_modes`.
        2. Encode the label: numeric class id for multi-class, ``1`` otherwise.
        3. Add a ``datetime`` column holding the timestamp floored to the unit of
           the prediction window (day, hour or minute).
        4. Split the rows into non-failure rows (training features) and failure
           rows, then let the SROM helpers `generate_key_col`,
           `generate_failure_targets` and `merge_feature_and_failure` build the
           labelled frame.
        5. Rename ``target_label`` back to `self.feature` and restore the
           (entity id, timestamp) index.

        Parameters
        ----------
        df : `pandas.DataFrame`
            Input data; its index levels are expected to include the entity id
            and timestamp names given by `self._entity_type`.
        start_ts, end_ts, entities : optional
            Not used by this method.

        Returns
        -------
        `pandas.DataFrame`
            The merged DataFrame indexed by (entity id, timestamp).

        Raises
        ------
        RuntimeError
            If no failure record remains after filtering.
        """
        self.logger.debug('Executing Failure Prediction pipeline on feature "%s". Input: df=%s', self.feature, log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('Input DataFrame failure records: %s', log_df_info(df[pd.notna(df[self.feature])][self.feature], head=5, logger=self.logger, log_level=logging.DEBUG))

        df_index_id_name, df_index_timestamp_name = self._entity_type._df_index_entity_id, self._entity_type._timestamp
        srom_id_name, srom_timestamp_name = 'asset_id', 'datetime'

        df = df.reset_index()

        # Current implementation:
        # - non multi-class:
        #  - original model, either failure happened (1) or not (0), no class consideration at all
        #  - with failure_modes specified, either certain class(es) of failure happened (1) or not (0)
        # - multi-class:
        #  - each of the specified failure_modes is a class, all others (non-specified failure classes and non-failures) are 0
        #
        # TODO should we drop those non-specified failure classes or treat them as non-failures?

        # filter by the failure_modes given
        self.logger.debug('Using failure_modes=%s', self.failure_modes)
        if len(self.failure_modes) > 0:
            # with failure_modes specified, treat anything else as non-failure events (for training)
            df.loc[~df[self.feature].isin(self.failure_modes), self.feature] = np.nan

            self.logger.debug(
                'Input DataFrame filtered by failure modes: %s',
                log_df_info(
                    df[pd.notna(df[self.feature])][[df_index_id_name, df_index_timestamp_name, self.feature]],
                    head=5,
                    logger=self.logger,
                    log_level=logging.DEBUG
                )
            )

            failure_modes_present = set(df[self.feature].dropna().unique())
            failure_modes_not_found = set(self.failure_modes.keys()) - failure_modes_present
            if len(failure_modes_not_found) > 0:
                for fm in failure_modes_not_found:
                    self.failure_modes[fm][1] = False # set present flag to be False
                self.logger.debug('Failure modes not found: %s, new failure modes: %s', failure_modes_not_found, self.failure_modes)

            if self.multiclass:
                # encode/transform the problemcode string to numerical value
                df[self.feature] = df[self.feature].map(lambda x: self.failure_modes.get(x, [np.nan])[0])

                self.logger.debug(
                    'Input DataFrame filtered by numerized Multiclass failure modes: %s',
                    log_df_info(
                        df[pd.notna(df[self.feature])][[df_index_id_name, df_index_timestamp_name, self.feature]],
                        head=5,
                        logger=self.logger,
                        log_level=logging.DEBUG
                    )
                )

                self.logger.debug('Input DataFrame (Multiclass, filtered)=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # for non multi-class, transform all failures to 1 while the rest to NaN
        if not self.multiclass:
            df[self.feature] = np.where(pd.notna(df[self.feature]), 1, np.nan)
            self.logger.debug('Input DataFrame (Non-Multiclass): %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # parse the window size once; it is needed for flooring and for the key interval
        window_offset = to_offset(self.prediction_window_size)

        # use clock / calendar floor for settling the time window range for failure target generation
        for offset_cls in (Day, Hour, Minute):
            if type(window_offset) is offset_cls:
                df[srom_timestamp_name] = df[df_index_timestamp_name].dt.floor(offset_cls().freqstr)
                break

        self.logger.debug('df_input_after_to_base=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # rows without a failure value become the training (feature) rows
        df_train = df[pd.isna(df[self.feature])]

        self.logger.debug('Removed failure record in training DataFrame: df_train=%s', log_df_info(df_train, head=5, logger=self.logger, log_level=logging.DEBUG))

        df_train = df_train.drop(columns=[self.feature])
        self.logger.debug('Dropped feature columns on training DataFrame: df_train=%s', log_df_info(df_train, head=5, logger=self.logger, log_level=logging.DEBUG))

        # rows with a failure value form the failure table
        df_failure = df[[df_index_id_name, df_index_timestamp_name, srom_timestamp_name] + [self.feature]]
        df_failure = df_failure.dropna(subset=[self.feature])
        df_failure = df_failure.astype({self.feature: int})
        self.logger.debug('Failure DataFrame: %s', log_df_info(df_failure, head=5, logger=self.logger, log_level=logging.DEBUG))

        if df_failure.empty:
            msg = 'The number of failure data is zero, please check if asset has failure data and if pipeline\'s model_pipeline[\'features_for_training\'] is set correctly'
            self.logger.error(msg)
            raise RuntimeError(msg)

        # generate keys of (assetid, datetime) based on the unit of time interval
        from srom.failure_prediction.preprocessing import generate_key_col
        failure_keys = generate_key_col(df=df_train,
                                        date_clm=srom_timestamp_name,
                                        asset_id_clm=df_index_id_name,
                                        interval=window_offset.base.name)
        self.logger.debug('Generated failure_keys=%s', log_df_info(failure_keys, head=5, logger=self.logger, log_level=logging.DEBUG))

        # Generating the labels and saving them
        from srom.failure_prediction.preprocessing import \
            generate_failure_targets
        failure_target_table = generate_failure_targets(
            failure_table=df_failure,
            failure_keys=failure_keys,
            failure_detection_window_size=self.prediction_window_size,
            failure_asset_id=df_index_id_name,
            failure_date=srom_timestamp_name,
            failure_id=self.feature)
        self.logger.debug('Generated failure_target_table=%s', log_df_info(failure_target_table, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('Labeled failure_target_table=%s',
            log_df_info(
                failure_target_table[failure_target_table['target_label'] != 0],
                head=5,
                logger=self.logger,
                log_level=logging.DEBUG
            )
        )

        # the SROM helper names the id column 'asset_id'; map it back to the entity id name
        failure_target_table = failure_target_table.rename(columns={srom_id_name: df_index_id_name})

        # fix MergeError: incompatible merge keys [0] dtype('<M8[ms]') and dtype('<M8[ns]'), must be the same type
        # df_train[srom_timestamp_name] is dtype('<M8[ms]'); the assignment aligns on the row index
        df_train[srom_timestamp_name] = df[srom_timestamp_name].astype('datetime64[ns]')

        df = merge_feature_and_failure(df_train, failure_target_table, df_index_id_name, srom_timestamp_name)
        # rename the target label to be the original label/target name
        df = df.rename(columns={'target_label': self.feature})
        df = df.set_index(keys=[df_index_id_name, df_index_timestamp_name])

        self.logger.debug('Final DataFrame=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return df


class FailurePredictionRcaEstimator(SromEstimator):
    """Failure probability root cause estimator.

    This sub-class estimator mainly overrides method `get_stages()` to construct the model pipeline stages.

    This class is a simple wrapper of the underlying SROM library.
    """

    def configure_pipeline(self, srom_pipeline):
        """Register the label column and set ``'accuracy'`` scoring on the pipeline, then return it."""
        srom_pipeline.add_input_meta_data(label_column=self.features_for_training[0])
        srom_pipeline.set_scoring('accuracy')
        return srom_pipeline

    def get_stages(self, df):
        """Return the pipeline stages: feature selection (all features kept) then a decision tree.

        `df` is not used. The step name ``'Decision Tree Classifier'`` is looked
        up by name in `process_prediction_result()` and `get_model_extra()`.
        """
        return [
            [
                ('SelectKBest - f_classif all', SelectKBest(k='all'))
            ],
            [
                # no fixed random_state is set on the tree
                ('Decision Tree Classifier', DecisionTreeClassifier(max_depth=5, min_samples_leaf=0.01))
            ]
        ]

    def train_model(self, df):
        """Build, execute and fit the RCA pipeline on `df`.

        This override always uses the stages from `get_stages()`; it deliberately
        ignores `override_training_stages`.

        Returns
        -------
        The fitted SROM pipeline.

        Raises
        ------
        RuntimeError
            If the pipeline execution yields no best estimator.
        """
        # we need to override because we do not want the override_training_stages to override stage of FailurePredictionRcaEstimator
        self.logger.debug('begin of FailurePredictionRcaEstimator train_model df_input=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        srom_pipeline = self.create_pipeline()
        srom_pipeline = self.configure_pipeline(srom_pipeline)
        srom_pipeline.set_stages(self.get_stages(df))

        param_grid = self.get_param_grid()
        if param_grid is not None:
            self.srom_training_options['param_grid'] = param_grid

        srom_pipeline.execute(df, **self.srom_training_options)

        if srom_pipeline.get_best_estimator() is None:
            raise RuntimeError('Training failed to find the best estimator, try retraining with more data or try again directly if using random search.')

        srom_pipeline.fit(df)

        return srom_pipeline

    def process_prediction_result(self, df, prediction_result, model):
        """Store the decision-tree path of every row in the first prediction column of `df`.

        The prediction frame is pushed through every pipeline step preceding
        ``'Decision Tree Classifier'``; the tree's decision path is then computed
        for each row. When `self.model_timestamp` is non-empty, each path is
        wrapped as ``{'tree': <name>, 'rca_path': <path>}``.

        Parameters
        ----------
        df : `pandas.DataFrame`
            Frame being scored; receives the new column ``self.predictions[0]``.
        prediction_result : sequence
            Only logged here.
        model : SROM pipeline
            Trained pipeline exposing ``get_best_estimator()``.

        Returns
        -------
        `pandas.DataFrame`
            `df` with the RCA path column added.
        """
        self.logger.debug('prediction_result_length=%d, prediction_result=%s', len(prediction_result), prediction_result[:10])

        model = model.get_best_estimator()
        self.logger.debug("self.model_timestampp=%s", self.model_timestamp)
        model_path = self.model_timestamp
        self.logger.debug(model)

        dff = self.get_df_for_prediction(df)
        extracted_features_columns = dff.columns
        self.logger.debug('extracted_features_columns=%s', extracted_features_columns)
        features_used = []
        for step_name, step_transformer in model.steps:
            if isinstance(step_transformer, SelectKBest):
                mask = step_transformer.get_support()
                self.logger.debug('step_name=%s, mask=%s', step_name, mask)
                features_used.extend(col_name for i, col_name in enumerate(extracted_features_columns) if mask[i])

            # stop right before the tree: it needs the transformed input, not its own output
            if step_name == 'Decision Tree Classifier':
                break
            dff = model.named_steps[step_name].transform(dff)
            self.logger.debug('step_name=%s, dff=%s', step_name, log_df_info(dff, head=5, logger=self.logger, log_level=logging.DEBUG))

        self.logger.debug('features_used=%s', features_used)
        self.logger.debug('final dff=%s', log_df_info(dff, head=5, logger=self.logger, log_level=logging.DEBUG))

        decision_path_output = self._simple_decision_path(model.named_steps['Decision Tree Classifier'], dff)
        self.logger.debug('decision_path_output_length=%s, decision_path_output=%s', len(decision_path_output), decision_path_output[:10])
        if len(model_path) > 0:
            tree_name = self.predictions[0] + '_' + str(model_path) + '_tree'
            decision_path_output = [{'tree': tree_name, 'rca_path': rca_path} for rca_path in decision_path_output]
        self.logger.debug('decision_path_output_length=%s, decision_path_output=%s', len(decision_path_output), decision_path_output[:10])

        df[self.predictions[0]] = decision_path_output

        return df

    def get_model_extra(self, new_model, model_path):
        """Build the extra artifacts saved with the model: feature importances and the tree graph.

        Parameters
        ----------
        new_model : SROM pipeline
            Trained pipeline exposing ``get_best_estimator()``.
        model_path : `str`
            Base path/name of the model; the artifacts are stored under
            ``model_path + '_features'`` and ``model_path + '_tree'``.

        Returns
        -------
        `list` of `tuple`
            ``(path, content, False, False)`` entries: a JSON list of
            ``[feature, feature, importance]`` sorted by descending importance
            (zero importances omitted), and the Graphviz DOT text of the tree.
        """
        index = model_path.rfind('_') + 1
        # NOTE(review): the slice end of -1 drops the last character of
        # model_path -- kept as in the original, confirm this is intended.
        self.model_path = model_path[index: -1]
        extras = []
        best_estimator = new_model.get_best_estimator()
        self.logger.debug('Model best estimator: %s', best_estimator)
        decision_tree = best_estimator.named_steps['Decision Tree Classifier']

        feat_importance1 = decision_tree.feature_importances_
        self.logger.debug('feat_importance1=%s', feat_importance1)

        features_used = []
        extracted_features_columns = self.features
        self.logger.debug('extracted_features_columns=%s', extracted_features_columns)
        for _, step_transformer in best_estimator.steps:
            if isinstance(step_transformer, SelectKBest):
                mask = step_transformer.get_support()
                features_used.extend(col_name for i, col_name in enumerate(extracted_features_columns) if mask[i])
        self.logger.debug('features_used=%s', features_used)

        # position i in features_used corresponds to input feature i of the tree
        feat_importance = [
            [feat, feat, round(feat_importance1[i], 5)]
            for i, feat in enumerate(features_used)
            if feat_importance1[i] != 0
        ]
        self.logger.debug('feat_importance=%s', feat_importance)

        feat_importance.sort(key=lambda x: x[2], reverse=True)
        self.logger.debug('sorted_feat_importance=%s', feat_importance)

        model_features_path = model_path + '_features'
        extras.append((model_features_path, json.dumps(feat_importance), False, False))

        dot_data = tree.export_graphviz(
            decision_tree,
            out_file=None,
            filled=True,
            rounded=True,
            impurity=False,
            special_characters=True,
            feature_names=features_used)
        model_tree_path = model_path + '_tree'
        extras.append((model_tree_path, dot_data, False, False))

        self.logger.debug('Saved extras for model: extras=%s', extras)

        return extras

    def _simple_decision_path(self, estimator, df):
        """Return, for every row of `df`, the visited tree node ids as a string such as ``'[0, 1, 4]'``.

        Parameters
        ----------
        estimator : fitted tree estimator
            Object exposing ``decision_path`` returning a sparse indicator matrix.
        df : array-like
            Input samples for the tree.

        Returns
        -------
        `list` of `str`
            One entry per row, listing the node ids in ascending order.
        """
        path = estimator.decision_path(df).toarray()  # scipy sparse matrix -> dense numpy array
        # .tolist() converts to plain Python ints so that str() renders '[0, 1, 4]'
        return [str(np.flatnonzero(row == 1).tolist()) for row in path]


class MulticlassFailurePredictionEstimator(SromEstimator):
    """Multi-class failure probability estimator.

    This sub-class estimator mainly overrides method `get_stages()` to construct the model pipeline stages.

    This class is a simple wrapper of the underlying SROM library.
    """

    def __init__(self, features, targets, predictions, **kwargs):
        """Initialize the estimator and reset the multi-class evaluation metrics.

        All arguments are forwarded unchanged to the base class constructor.
        """
        super().__init__(features=features, targets=targets, predictions=predictions, **kwargs)

        # evaluation results; only the classification report is filled by train_model()
        self.multiclass_classification_report = None
        self.multiclass_coverage_error = None
        self.multiclass_lrap_score = None
        self.multiclass_label_ranking = None

    def configure_pipeline(self, srom_pipeline):
        """Register the label column and set weighted-F1 scoring on the pipeline, then return it."""
        srom_pipeline.add_input_meta_data(label_column=self.features_for_training[0])
        srom_pipeline.set_scoring(make_scorer(f1_score, average='weighted'))
        return srom_pipeline

    def get_stages(self, df):
        """Return the pipeline stages: scaling, feature reduction and classification.

        Balanced class weights are computed from the label column of `df`
        (first entry of `self.features_for_training`) and applied to the
        random forest classifier.
        """
        train_y = df[[self.features_for_training[0]]].values
        unique_classes = np.unique(train_y)

        class_weight = compute_class_weight(class_weight='balanced', classes=unique_classes, y=train_y.flatten())
        self.logger.debug('class_weight=%s', class_weight)
        self.logger.debug('unique_classes=%s', unique_classes)

        class_weight_dict = dict(zip(unique_classes, class_weight))
        self.logger.debug('class_weight_dict=%s', class_weight_dict)

        return [
            [
                NoOp(),
                StandardScaler(),
                RobustScaler(),
                MinMaxScaler(),
            ],
            [
                PCA(),
                SelectKBest(k=2),
            ],
            [
                DecisionTreeClassifier(),
                RandomForestClassifier(n_estimators=100, class_weight=class_weight_dict),
                AdaBoostClassifier(),
            ],
        ]

    def train_model(self, df):
        """Search for the best pipeline on a train split, refit it on all data and record a classification report.

        The stages come from `get_stages()` unless `self.override_training_stages`
        is set. The label column is the first entry of `self.features_for_training`;
        all other columns are features.

        Returns
        -------
        The fitted SROM pipeline.

        Raises
        ------
        RuntimeError
            If the pipeline execution yields no best estimator.
        """
        self.logger.debug('df_input=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        srom_pipeline = self.create_pipeline()
        srom_pipeline = self.configure_pipeline(srom_pipeline)
        self.logger.info('override_training_stages=%s', self.override_training_stages)
        srom_pipeline.set_stages(self.get_stages(df) if self.override_training_stages is None else self.override_training_stages)

        label_column = self.features_for_training[0]
        FPA_train_clms = list(df.columns)
        FPA_train_clms.remove(label_column)
        FPA_test_clms = [label_column]

        # no resampling (e.g. ADASYN) is applied; the data is used as is
        X_res = df[FPA_train_clms]
        y_res = df[FPA_test_clms]
        self.logger.debug('X_res=%s', log_df_info(X_res, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('y_res=%s', log_df_info(y_res, head=5, logger=self.logger, log_level=logging.DEBUG))

        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, random_state = 0)

        self.logger.debug('X_train=%s', log_df_info(X_train, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('y_train=%s', log_df_info(y_train, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('df=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # the model search only sees the training split
        srom_pipeline.execute(X_train, y_train, max_eval_time_minute=2)

        self.logger.debug('srom_pipeline.best_estimator=%s, srom_pipeline.best_score=%s', srom_pipeline.best_estimator, srom_pipeline.best_score)

        if srom_pipeline.get_best_estimator() is None:
            raise RuntimeError('Training failed to find the best estimator, try retraining with more data or try again directly if using random search.')

        # NOTE(review): the final fit uses the full data set, so the report below
        # is computed on rows the model has already seen during fitting.
        srom_pipeline.fit(X_res, y_res)
        self.logger.debug('after srom_pipeline.fit srom_pipeline=%s', srom_pipeline)

        from sklearn.metrics import classification_report
        y_score = srom_pipeline.predict(X_test)
        # result currently unused; the call is retained from the original so a
        # pipeline lacking predict_proba still fails here, at training time
        srom_pipeline.predict_proba(X_test)

        self.multiclass_classification_report = classification_report(y_test, y_score)
        self.logger.debug('train model multiclass_classification_report=%s', self.multiclass_classification_report)

        # coverage error, LRAP score and label ranking loss are not computed
        # yet; the attributes keep their initial value
        self.logger.debug('train model multiclass_coverage_error=%s', self.multiclass_coverage_error)
        self.logger.debug('train model lrap_score=%s', self.multiclass_lrap_score)
        self.logger.debug('train model label_ranking=%s', self.multiclass_label_ranking)

        return srom_pipeline

    def process_prediction_result(self, df, prediction_result, model):
        """Write the per-failure-mode probabilities into the prediction columns of `df`.

        Parameters
        ----------
        df : `pandas.DataFrame`
            Frame being scored; receives one column per entry of `self.predictions`.
        prediction_result : `list` of ``(predicted_label, probabilities)`` or `None`
            `None` means no model was available; all prediction columns are then
            filled with NaN.
        model : object
            Not used by this method.

        Returns
        -------
        `pandas.DataFrame`
            `df` with the prediction columns set.
        """
        # the None check must come first: len(None) would raise TypeError
        if prediction_result is None:
            for prediction_output in self.predictions:
                df[prediction_output] = np.nan

            self.logger.debug('No suitable model found. Created null predictions')
            return df

        self.logger.debug('prediction_result_length=%d, prediction_result=%s', len(prediction_result), prediction_result[:10])

        # one probability list per prediction output, keyed by 1-based failure mode id
        prediction_results = {idx+1: list() for idx in range(len(self.predictions))}

        # ids of the failure modes flagged as present, in ascending order, so that
        # position k matches column k of the probability vector
        failure_mode_id_num = sorted(id_num for id_num, present in self.failure_modes.values() if present)
        failure_mode_id_num.insert(0, None) # mode 0, non-failure, is always present

        for predicted_failure_mode, probability in prediction_result:
            for idx, prob in enumerate(probability):
                if idx == 0:
                    # skip the probability of the non-failure class
                    continue
                prediction_results[failure_mode_id_num[idx]].append(prob)

        for idx, prediction_output in enumerate(self.predictions):
            df[prediction_output] = prediction_results[idx+1] if len(prediction_results[idx+1]) > 0 else None

        self.logger.debug('df_final_prediction_result=%s', log_df_info(df[self.predictions], head=10))

        return df

    def get_model_extra(self, new_model, model_path):
        """Return the evaluation metrics as a JSON artifact to be saved with the model.

        Parameters
        ----------
        new_model : object
            Not used by this method.
        model_path : `str`
            Base path/name of the model; the artifact is stored under
            ``model_path + '_multiclass_classification_metric_json'``.

        Returns
        -------
        `list` of `tuple`
            A single ``(path, json_text, False, False)`` entry.
        """
        extras = []

        model_json_path = model_path + '_multiclass_classification_metric_json'

        metrics = {'multiclass_classification_report': self.multiclass_classification_report,
                   'multiclass_coverage_error': self.multiclass_coverage_error,
                   'multiclass_lrap_score': self.multiclass_lrap_score,
                   'multiclass_label_ranking' : self.multiclass_label_ranking}
        extras.append((model_json_path, json.dumps(metrics), False, False)) # no pickle dump, not binary

        self.logger.debug('extras=%s', extras)

        return extras


class MulticlassFailurePredictionRcaEstimator(FailurePredictionRcaEstimator):
    """Multi-class failure probability root cause estimator.

    This sub-class estimator only overrides method `get_stages()` to construct the model pipeline stages.

    This class is a simple wrapper of the underlying SROM library.
    """

    def get_stages(self, df):
        """Return the pipeline stages: a SelectKBest feature selection followed by a decision tree.

        `df` is not used.
        """
        # TO DO need to change k=1 and k=2 to 10 and 20
        # NOTE(review): the step label says "2" while k=4 is used.
        feature_selection_stage = [
            ('SelectKBest - f_classif 2', SelectKBest(k=4)),
        ]
        classification_stage = [
            ('Decision Tree Classifier', DecisionTreeClassifier(max_depth=6, min_samples_leaf=0.002)),
        ]
        # Alternative considered earlier: SelectKBest with k=10 and k=5, and a
        # tree with max_depth=5, min_samples_leaf=0.01.
        return [feature_selection_stage, classification_stage]

    # def get_df_for_training(self, df):
    #     # for RCA model, we need the target_label for training
    #     # df = df['target_label']
    #     # df = df.reset_index(drop=True)
    #     # df = df.dropna(axis=0,how='any')
    #     # df = df.astype({'target_label': 'int64'})

    #     # TODO since we do not have enough data, use csv for testing

    #     df = pd.DataFrame()

    #     # df = pd.read_csv('pmlib/tests/auto_regression_kewei.csv')
    #     # df = pd.read_csv('pmlib/tests/decision_tree.csv' )
    #     # generate random integer values
    #     from numpy.random import randint, seed
    #     from random import random, randrange, seed
    #     # seed random number generator

    #     # reset the seed
    #     seed(1)
    #     a = random()
    #     b = random()
    #     seed(2)

    #     # generate random numbers between 0-1
    #     df['maxsecondarycurrent'] = randint(0, 1000, 20846)
    #     df['voltage'] = randint(0, 1000, 20846)

    #     # change type form int64 to float64
    #     df['voltage'] = df['voltage'].astype('float64')
    #     df['maxsecondarycurrent'] = df['maxsecondarycurrent'].astype('float64')

    #     df['maxsecondarycurrent__min__1d'] = df['maxsecondarycurrent'] * (a + b)
    #     df['voltage__min__1d'] = df['maxsecondarycurrent']*b

    #     # the target
    #     my_randoms=[]
    #     for i in range(df.shape[0]):
    #         my_randoms.append(randrange(0,3,1))
    #     df[self.features_for_training[0]] = my_randoms

    #     return df



class DataQualityAdvisor(_BaseTransformer):
    """Pipeline stage that checks the data quality of the training data.

    The stage runs the ``dqlearn`` FPA data quality checks on the sensor
    readings and on the failure records found in the incoming DataFrame, and
    raises an error when either resulting score is lower than the configured
    minimum. The DataFrame itself is passed through unchanged.
    """

    def __init__(self,feature,minimum_acceptable_data_quality_score = 0, dq_threshold_failure = 0, **kwargs):
        """
        Parameters
        ----------
        feature : `str`
            The failure event feature (target variable)
        minimum_acceptable_data_quality_score = 0 : `float`
            The data quality score below which the pipeline execution will be stopped
        dq_threshold_failure = 0 : `float`
            Not used, and will be removed in the future.
        **kwargs
            Optional column name overrides: ``sensor_timestamp_column_name``
            (default ``'evt_timestamp'``), ``sensor_asset_id_column_name``
            (default ``'id'``), ``failure_timestamp_column_name`` (default
            ``'evt_timestamp'``) and ``failure_asset_id_column_name``
            (default ``'id'``).
        """

        super().__init__()

        self.feature = feature
        self.minimum_acceptable_data_quality_score = minimum_acceptable_data_quality_score
        self.sensor_timestamp_column_name = kwargs.get('sensor_timestamp_column_name', 'evt_timestamp')
        self.sensor_asset_id_column_name = kwargs.get('sensor_asset_id_column_name', 'id')
        self.failure_timestamp_column_name = kwargs.get('failure_timestamp_column_name', 'evt_timestamp')
        self.failure_asset_id_column_name = kwargs.get('failure_asset_id_column_name', 'id')

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Score sensor and failure data quality and stop the flow on a low score.

        Parameters
        ----------
        df : `pandas.DataFrame`
            Training data. It must contain the ``feature`` column and, once its
            index is reset, the configured asset id and timestamp columns.
        start_ts, end_ts, entities
            Not used by this stage.

        Returns
        -------
        `pandas.DataFrame`
            The input ``df``, unmodified.

        Raises
        ------
        RuntimeError
            If the sensor or the failure data quality score is lower than
            ``minimum_acceptable_data_quality_score``.
        """
        self.logger.debug('Begin DataQualityAdvisor df_input=%s', log_df_info(df, head=0, logger=self.logger, log_level=logging.DEBUG))
        # A single threshold applies to both the sensor and the failure score.
        self.logger.debug('Begin DataQualityAdvisor self.minimum_acceptable_data_quality_score=%s', self.minimum_acceptable_data_quality_score)

        # Sensor data: every row, with the index levels turned into columns.
        sensor_table = df.reset_index()
        self.logger.debug('After resetting index, sensor_table=%s', log_df_info(sensor_table, head=0, logger=self.logger, log_level=logging.DEBUG))

        # Failure data: rows with a non-zero label (for multiclass it can be 1, 2, 3).
        failure_table = df[df[self.feature] != 0].reset_index()
        self.logger.debug('After resetting index, failure_table=%s', log_df_info(failure_table, head=0, logger=self.logger, log_level=logging.DEBUG))

        # Everything that is not the label, the 'datetime' column or an
        # asset id / timestamp column is treated as a sensor data column.
        non_data_columns = {
            self.feature,
            'datetime',
            self.sensor_asset_id_column_name,
            self.sensor_timestamp_column_name,
            self.failure_asset_id_column_name,
            self.failure_timestamp_column_name,
        }
        data_col = [column for column in sensor_table.columns if column not in non_data_columns]

        # Imported lazily so that dqlearn is only required when this stage runs.
        from dqlearn.pipeline.fpa import FPADataQuality

        FPA_DQ = FPADataQuality(params={
            'sensor_asset_col': self.sensor_asset_id_column_name,
            'sensor_timestamp_col': self.sensor_timestamp_column_name,
            'data_col': data_col,
            'failure_asset_col': self.failure_asset_id_column_name,
            'failure_timestamp_col': self.failure_timestamp_column_name,
            'failure_col': self.feature
        })

        # Select with the *sensor* column names and cast through astype(),
        # which returns a new frame instead of writing into a selection.
        sensor_columns = [self.sensor_asset_id_column_name, self.sensor_timestamp_column_name] + data_col
        sensor_table = sensor_table[sensor_columns].astype({
            self.sensor_asset_id_column_name: 'str',
            self.sensor_timestamp_column_name: 'str',
        })

        self.logger.info('Running sensor data quality. This prints out entire DataFrames to stdout and ignores the log level.')
        data_quality_pipeline = FPA_DQ.sensor_data_quality(sensor_table)

        self.logger.info('Printing DataQualityAdvisor result report. This report will print regardless of log level and can\'t be changed. Report:')
        data_quality_pipeline.result_report()

        data_quality_report = data_quality_pipeline.report.report
        # A report without a score counts as a score of 0.
        data_quality_score = data_quality_report.get('score', 0)

        if data_quality_score < self.minimum_acceptable_data_quality_score:
            raise RuntimeError(f'Training pipeline flow stopped because the data quality score for Failure prediction {data_quality_score} is less than the minimum required {self.minimum_acceptable_data_quality_score}. Either try to lower the expected score or clean up the data.')
        self.logger.info(
            'Data Quality checks for the Failure Prediction training sensor features passed the treshold minium score required = %s, score computed by data quality advisor = %s',
            self.minimum_acceptable_data_quality_score,
            data_quality_score
        )

        from dqlearn.ui.pretty_display import pretty_valid, pretty_valid_list

        # Display the per-validator results of the sensor checks.
        valid_outputs = data_quality_pipeline.validator_results()
        for layers in valid_outputs.values():
            for layer in layers:
                result = layers[layer]
                if result == 'not_executed':
                    continue
                if isinstance(result, dict):
                    pretty_valid_list(result.values())
                else:
                    pretty_valid(result)

        failure_table = failure_table[[self.failure_asset_id_column_name, self.failure_timestamp_column_name, self.feature]]
        failure_data_quality_pipeline = FPA_DQ.failure_data_quality(failure_table)

        failure_data_quality_pipeline.result_report()

        failure_data_quality_report = failure_data_quality_pipeline.report.report
        failure_data_quality_score = failure_data_quality_report.get('score', 0)

        if failure_data_quality_score < self.minimum_acceptable_data_quality_score:
            raise RuntimeError(f'Training pipeline flow stopped because the data quality score for Failure prediction failure data {failure_data_quality_score} is less than the minimum required {self.minimum_acceptable_data_quality_score}. Either try to lower the expected score or clean up the data.')
        # Report the failure score here, not the sensor score computed above.
        self.logger.info(
            'Data Quality checks for the Failure Prediction training failure features passed the treshold minium score required = %s, score computed by data quality advisor = %s',
            self.minimum_acceptable_data_quality_score,
            failure_data_quality_score
        )

        return df


