# Licensed Materials - Property of IBM
# 5737-M66, 5900-AAA
# (C) Copyright IBM Corp. 2019, 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication, or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.

"""Predicted Failure Date Model.

This model uses the survival_analysis package of SROM supporting various algorithms to compute the survival functions and median time to failure. The survival analysis package handles both survival and failure analysis.

Survival analysis is a branch of statistics for analyzing the expected duration of time until one or more events happen including, but not limited to, failure in mechanical systems. This topic is called reliability theory or reliability analysis in engineering, duration analysis or duration modeling in economics, and event history analysis in sociology. Survival analysis attempts to answer questions such as: what is the proportion of a sample that will survive past a certain time? Of those that survive, at what rate will they fail? How do particular circumstances or characteristics or causes increase or decrease the probability of survival or failure?

Broadly speaking, survival analysis involves the modeling of time to event data; in this context involving industrial processes and assets, failure or fault could be considered an "event". The key assumption here is a single event occurs for each subject, after which the subject (asset or a process) is stopped or failed - meaning it ceases to exist or survive.
"""
import json
import logging

import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import (Day, Hour, Minute, MonthBegin, MonthEnd,
                                    Second, Week)

from .estimator import SromEstimator
from .pipeline import AssetGroupPipeline
from .time_to_event_feature_eng import TSAggOps
from .transformer import _BaseTransformer
from .util import log_df_info


class TimeToFailureAssetGroupPipeline(AssetGroupPipeline):
    """Predicted failure date model pipeline.

    This `pmlib.pipeline.AssetGroupPipeline` sub-class implements the predicted failure date model.

    Trained predicted failure date model generates a predicted time to failure in days for each
    incoming event. You can choose the names of the outputs by key `predictions` of the dict parameter
    `model_pipeline` (the first entry of `predictions` names the predicted time to failure and an
    optional second entry names the predicted failure date).

    As for the inputs to the model, you can use `features` of the dict parameter `model_pipeline` to
    specify the attributes from assets and/or IoT devices. As for the `features_for_training` of
    the dict parameter `model_pipeline`, it is required to be a length-two array containing two
    datetime attributes representing the asset installation date and asset failure history. The order matters.

    Here's a typical example of how to create an object of this class if you want to use the Survival Analysis model:

    ```
    TimeToFailureAssetGroupPipeline(
        asset_group_id='ID of an asset group',
        model_pipeline={
            'features': ['DeviceTypeOne:temperature', 'DeviceTypeTwo:Humidity'],

            'predictions': ['predicted_time_to_failure','predicted_failure_date'],
            'srom_training_options': {
                'exectype': 'single_node_complete_search'
            }
        })
    ```

    Here's a typical example of how to create an object of this class if you want to use Smart Regression:

    ```
    TimeToFailureAssetGroupPipeline(
        asset_group_id='ID of an asset group',
        model_pipeline={
            'features': ['DeviceTypeOne:temperature', 'DeviceTypeTwo:Humidity'],

            'predictions': ['predicted_time_to_failure','predicted_failure_date'],

            'failure_mode': 'PUMPS/LOWPRES',

            'aggregation_methods':['min'],
            'aggregate_window_size': 3,
            # 'aggregate_type_for_prediction_interval': 'mean',
            'smart_regression':True
        })
    ```

    This model pipeline also adds a post-processing phase to generate a daily maximum predicted time
    to failure. The name of this daily predicted time to failure is simply the specified output name
    prefixed with 'daily_'.

    See base class for details on all the parameters available.
    """

    def __init__(self, **kwargs):
        """Create the pipeline, forcing the model template metadata for this exact class.

        The template name and description are only overridden when the object being
        created is a `TimeToFailureAssetGroupPipeline` itself; for sub-classes the given
        `kwargs` are passed to the base class untouched.
        """
        is_exact_class = self.__class__ == TimeToFailureAssetGroupPipeline
        if is_exact_class:
            kwargs.update(model_template_name='Predicted Failure Date', model_template_desc=None)
        super().__init__(**kwargs)

    def default_summary(self, model_config):
        """This class generates by default a daily maximum predicted time to failure.

        See `pmlib.pipeline.AssetGroupPipeline.default_summary`.
        """




        unit_name = None

        # aggregate_window_size
        #unit = type(to_offset(model_config['aggregate_window_size']).base)

        # default aggregate_window_size to 1D
        unit = type(to_offset(model_config.get('aggregate_window_size','1D')).base)
        if unit == Hour:
            unit_name = 'hourly'
        elif unit == Day:
            unit_name = 'daily'
        elif unit == Minute:
            unit_name = 'minute'
        else:
            raise ValueError('invalid model_pipeline.prediction_window_size=%s, unsupported offset alias, can only be one of H and D (case insensitive)' % model_config['prediction_window_size'])

        self.logger.debug('default_summary unit_name=%s', unit_name)

        pred = model_config['predictions']
        num_outputs = len(pred)

        summary = {}
        for idx in range(num_outputs):
            summary['${predictions[%s]}' % str(idx)] = {
                unit_name: {
                    'max': ('${granularity}_${data_item}', '${data_item}'),
                }
            }
                
        return summary

        

    def prepare_model_config(self, model_config):
        """This class overrides this method to set the default value to the following two custom model pipeline
        configuration when not given in the constructor's parameter `model_pipeline`.

        * `rolling_window_size`: by default, if not given, set to be the same as `prediction_window_size`
        * `aggregation_methods`: by default, if not given, set to be `['mean', 'max', 'min', 'median', 'std', 'sum', 'count']`

        It also validates whether the `prediction_window_size` is given and is given correctly and if not
        raises ValueError.

        See `pmlib.pipeline.AssetGroupPipeline.prepare_model_config`.
        """

        # features validation
        if model_config.get('features', None) is None:
            raise ValueError('model_pipeline.features must be given')
        if not isinstance(model_config['features'], list) or len(model_config['features']) == 0:
            raise ValueError('model_pipeline.features must be an non-empty array')

        if model_config.get('smart_regression', False) and model_config.get('failure_mode', None) is not None and len(model_config['failure_mode']) > 0:
            # one_failure_mode = 'PUMPS/STOPPED'
            one_failure_mode =str( model_config.get('failure_mode')  )
            one_failure_mode = one_failure_mode.replace('/','_').replace(' ','__')

            prediction_list = []
            prediction_list.append('%s_%s' % (model_config['predictions'][0], one_failure_mode.lower()))
            prediction_list.append('prediction_interval_%s_low' % one_failure_mode.lower())
            prediction_list.append('prediction_interval_%s_high' % one_failure_mode.lower())

            if len(model_config['predictions']) > 1:
                prediction_list.append('%s_%s' % (model_config['predictions'][1], one_failure_mode.lower()))

            model_config['predictions'] = prediction_list

            self.logger.debug('predictions=%s', model_config['predictions'])

        # if provide features_for_training, use it
        if model_config.get('features_for_training', None) is not None and len(model_config['features_for_training']) > 0:
            pass
        else:
            # if not provide features_for_training, infer it from model_config
            infered_features_for_training = []
            if not model_config.get('smart_regression', False):
                infered_features_for_training.extend([':installdate',':faildate'])
            else:
                if model_config.get('failure_mode', None) is not None and len(model_config['failure_mode']) > 0:
                    failure_modes = model_config['failure_mode']
                    if isinstance(failure_modes, str):
                        failure_modes = [failure_modes]
                    for failure_mode in failure_modes:
                        number_of_slash = failure_mode.count('/')
                        if number_of_slash == 0: # failure_mode='PUMPS'
                            infered_features_for_training.extend([':faildate',':classcode'])
                        elif number_of_slash == 1: # failure_mode='PUMPS/LEAKING'
                            infered_features_for_training.extend([':faildate',':problemcode'])
                        elif number_of_slash == 2: # failure_mode='PUMPS/LEAKING/SEALBROKEN'
                            infered_features_for_training.extend([':faildate',':causecode'])
                        elif number_of_slash == 3: # failure_mode='PUMPS/LEAKING/SEALBROKEN/REPLACE'
                            infered_features_for_training.extend([':faildate',':remedycode'])
                else:
                    raise ValueError('model_pipeline.failure_mode=%s must be an non-empty array' % model_config.get('failure_mode', None))

            # remove duplicate and keep order
            # infered_features_for_training = list(set(infered_features_for_training))
            infered_features_for_training = sorted(set(infered_features_for_training), key=lambda x: infered_features_for_training.index(x))

            # modify features_for_training
            model_config['features_for_training'] = [feature.replace(':', '') for feature in infered_features_for_training]

            # modify targets
            model_config['targets'] = [feature.replace(':', '') for feature in infered_features_for_training]

            # modify inputs
            # if model_config.get('inputs', None) is not None:
            inputs_list = list(model_config['inputs'])
            inputs_list.extend(infered_features_for_training)
            del model_config['inputs']
            model_config['inputs'] = tuple(inputs_list)

            # modify rename_inputs
            # if model_config.get('renamed_inputs', None) is not None:
            rename_input_list = list(model_config['renamed_inputs'])
            rename_input_list.extend([feature.replace(':', '') for feature in infered_features_for_training])
            del model_config['renamed_inputs']
            model_config['renamed_inputs'] = tuple(rename_input_list)

    def prepare_execute(self, pipeline, model_config):
        """Add the estimator stage and its training preprocessor to the pipeline.

        Depending on `smart_regression` of the model pipeline configuration, either
        `pmlib.time_to_failure.TimeToFailureEstimatorSrom` (survival analysis) or
        `MulticlassTimeToFailureEstimatorSrom` (smart regression) is used. In both cases a training
        preprocessor is attached which receives the first two entries of `features_for_training` as
        the install date and failure date columns, and `features_for_training` itself is replaced by
        the names of the features that preprocessor extracts.

        See `pmlib.pipeline.AssetGroupPipeline.prepare_execute`.
        """

        super().prepare_execute(pipeline, model_config)

        # the first two training features, read before they get replaced below
        training_columns = model_config['features_for_training']
        installdate_column = training_columns[0]
        faildate_column = training_columns[1]

        event_column, duration_column, days_to_fail_column = 'has_failed', 'days_run', 'days_to_fail'

        # a missing or None flag selects the survival analysis model
        use_smart_regression = model_config.get('smart_regression', False) or False

        if use_smart_regression:
            model_config['features_for_training'] = [days_to_fail_column]
            estimator = MulticlassTimeToFailureEstimatorSrom(**model_config)
        else:
            model_config['features_for_training'] = [event_column, duration_column]
            estimator = TimeToFailureEstimatorSrom(**model_config)

        pipeline.add_stage(estimator)

        # time unit handed to the failure mode preprocessor; stays None for any window
        # that is not hour, day or minute based
        window_unit = type(to_offset(self.pipeline_config.get('aggregate_window_size', '1D')).base)
        time_unit = {Hour: 'h', Day: 'D', Minute: 'm'}.get(window_unit)

        if use_smart_regression:
            preprocessor = TimeToFailureEstimatorFeatureExtractionFailureMode(installdate_column=installdate_column, faildate_column=faildate_column, event_column=event_column, duration_column=duration_column, pipeline_config=model_config,time_unit=time_unit)
        else:
            preprocessor = TimeToFailureEstimatorFeatureExtraction(installdate_column=installdate_column, faildate_column=faildate_column, event_column=event_column, duration_column=duration_column)
        estimator.add_training_preprocessor(preprocessor)



    def get_prediction_backtrack(self, model_config, **kwargs):
        """Extend the base class backtrack window by one aggregation window plus 30 days.

        The extra aggregation window (`aggregate_window_size`, `1d` when not configured, as is the
        case for the survival analysis model) is there because the initial window of data would
        otherwise be cut off; the additional 30 days simply make sure enough history is loaded.

        See `pmlib.pipeline.AssetGroupPipeline.get_prediction_backtrack`.
        """

        start, end = super().get_prediction_backtrack(model_config, **kwargs)
        for extra_window in (model_config.get('aggregate_window_size', '1d'), '30d'):
            start.append(to_offset(extra_window))
        return [start, end]

class TimeToFailureEstimatorSrom(SromEstimator):
    """Survival analysis estimator of the predicted failure date model.

    A thin wrapper of the underlying SROM library: it mainly overrides `get_stages()` to supply
    the candidate survival models, and post-processes the prediction into a failure date.
    """

    def get_stages(self,df):
        """Return the model pipeline stages: one stage holding the candidate survival models.

        The last two entries of `features_for_training` are used as the event column and the
        duration column respectively.
        """
        from srom.survival_analysis.aalen_additive_regression import \
            AalenAdditiveRegression
        from srom.survival_analysis.cox_regression import CoxRegression
        from srom.survival_analysis.kaplan_meier import KaplanMeier
        from srom.survival_analysis.nelson_aalen import NelsonAalen

        event_column, duration_column = self.features_for_training[-2], self.features_for_training[-1]

        self.logger.debug('event_column=%s, duration_column=%s', event_column, duration_column)

        # KaplanMeier and NelsonAalen are deliberately left out for now,
        # see https://github.ibm.com/maximo/Asset-Health-Insight/issues/13320 for details
        candidates = [
            model_class(duration_column=duration_column, event_column=event_column)
            for model_class in (CoxRegression, AalenAdditiveRegression)
        ]

        return [candidates]

    def get_param_grid(self):
        """Return an empty SROM parameter grid."""
        from srom.pipeline.srom_param_grid import SROMParamGrid
        empty_grid = SROMParamGrid(gridtype='empty')
        return empty_grid

    def predict(self, model, df):
        """Score `df` with `model.predict()`, since these models do not have predict_proba()."""
        prediction = model.predict(df)
        return prediction

    def get_prediction_result_value_index(self):
        """Return the index list `[0]` used to pick the value out of the prediction result."""
        value_index = [0]
        return value_index

    def process_prediction_result(self, df, prediction_result, model):
        """Let the base class process the result, then derive the predicted failure date.

        When more than one prediction name is configured, the second one receives the event
        timestamp (index level `evt_timestamp`) plus the first prediction rounded down to whole
        days. The DataFrame `df` is modified in place and returned.
        """
        super().process_prediction_result(df, prediction_result, model)
        self.event_timestamp_column_name ='evt_timestamp'

        if len(self.predictions) <= 1:
            return df

        date_column = self.predictions[1]
        df[date_column] = df.index.get_level_values(self.event_timestamp_column_name)
        df[date_column] = pd.to_datetime(df[date_column], format='%Y-%m-%d')
        whole_days = pd.to_timedelta(np.floor(df[self.predictions[0]]), unit='D')
        df[date_column] = df[date_column] + whole_days
        self.logger.debug('Processed prediction result DataFrame: %s',  log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return df



class MulticlassTimeToFailureEstimatorSrom(SromEstimator):
    """Predicted failure date estimator.

    This sub-class estimator mainly overrides method `get_stages()` to construct the model pipeline stages.

    This class is a simple wrapper of the underlying SROM library.
    """

    """
    This sub-class estimator mainly overrides method `get_stages()` to construct the model pipeline stages.
    This class is a simple wrapper of the underlying SROM library.

    The class for performing the auto-Regression in SROM using a well tested heuristic "Bottom-Up". \
    The model_stages in this class have already been setup from the benchmark results. \
    (link from the results of experimentation can be put here.)

    Parameters:
        level (String): Level of exploration (default or comprehensive).
        save_prefix (string): String prefix for the output save file.
        execution_platform (string): Platform for execution from srom pipeline. Supports spark also.
        cv (int): Value of 'k' in K-crossvalidation. This parameters is used from the sklearn \
                function GridSearchCV. \
                https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
        scoring (String, function): The value that defines the metrics for scoring the paths. \
                Can be a string if sklearn defined metrics used. Can be a function if a user \
                defined metric is used. This parameters is used from the sklearn function GridSearchCV. \
                https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
        stages (list of list of estimators): A list of list containing the transformer and \
                estimator tuples for customizing the preconfigured auto pipeline.
        execution_time_per_pipeline (int): Integer value denoting time (minutes) of execution \
                per path (path: combination of estimators and transformers)
        num_options_per_pipeline_for_random_search (int): Integer value denoting number \
                of parameters to use while performing randomized param search in *which* rounds.
        num_option_per_pipeline_for_intelligent_search: Integer value denoting number of \
                parameters to use while performing more intelligent param search in *which* rounds.
        total_execution_time (int): Total execution time (minutes) for the auto classification pipeline.
        param_grid (SROMParamGrid): Param grid with various parameter combination.



    """

    def __init__(self, features, targets, predictions, **kwargs):
        """Create the smart regression estimator from the model pipeline configuration.

        Parameters
        ----------
        features : `list`
            The features used for prediction.
        targets : `list`
            Passed to the base class as is.
        predictions : `list`
            The output names. At least three are required, in this order: the predicted time to
            failure, the low end and the high end of the prediction interval.
        **kwargs
            Passed to the base class as is. The following optional keys are also read here (default
            in parentheses): `aggregation_methods` (None), `aggregate_window_size` (1),
            `aggregate_type_for_prediction_interval` ('median'), `execution_type`
            ('single_node_random_search'), `cv` (5), `total_execution_time` (10),
            `number_of_estimators_for_pred_interval` (30), `number_of_leaders_for_ensemble` (5),
            `search_level` ('default') and `scoring_metric` ('r2').

        Raises
        ------
        RuntimeError
            If `aggregate_type_for_prediction_interval` is neither 'mean' nor 'median'.
        """
        super().__init__(features=features, targets=targets, predictions=predictions, **kwargs)

        self.predicted_time_to_failure = self.predictions[0]
        self.prediction_interval_low = self.predictions[1]
        self.prediction_interval_high = self.predictions[2]
        self.logger.debug('in MulticlassTimeToFailureEstimatorSrom __init()__ self.prediction_interval_low= %s', self.prediction_interval_low)
        self.logger.debug('in MulticlassTimeToFailureEstimatorSrom __init()__ self.prediction_interval_high= %s', self.prediction_interval_high)

        self.features_for_prediction = features
        self.logger.debug('in MulticlassTimeToFailureEstimatorSrom __init()__ self.features_for_prediction= %s', self.features_for_prediction)

        # None means no aggregation is applied when preparing the data for prediction
        self.agg_fns = kwargs.get('aggregation_methods', None)
        # logged rather than printed so it obeys the configured log level
        self.logger.debug('in MulticlassTimeToFailureEstimatorSrom __init()__ self.agg_fns= %s', self.agg_fns)

        self.agg_time = kwargs.get('aggregate_window_size', 1)
        self.logger.debug('in MulticlassTimeToFailureEstimatorSrom __init()__ aggregate_window_size self.agg_time= %s', self.agg_time)

        self.aggregate_type_for_prediction_interval = kwargs.get('aggregate_type_for_prediction_interval', 'median')
        if self.aggregate_type_for_prediction_interval not in ('mean', 'median'):
            raise RuntimeError('invalid aggregate_type_for_prediction_interval. It can only be either mean or median')
        self.logger.debug('in MulticlassTimeToFailureEstimatorSrom __init()__  self.aggregate_type_for_prediction_interval= %s', self.aggregate_type_for_prediction_interval)

        # spark_node_random_search or single_node_random_search
        # In CP4D single_node_random_search is faster than spark_node_random_search because spark_node_random_search can cause timeout
        self.execution_type = kwargs.get('execution_type', 'single_node_random_search')

        self.cv = kwargs.get('cv', 5)
        self.total_execution_time = kwargs.get('total_execution_time', 10)
        self.number_of_estimators_for_pred_interval = kwargs.get('number_of_estimators_for_pred_interval', 30)
        self.number_of_leaders_for_ensemble = kwargs.get('number_of_leaders_for_ensemble', 5)
        self.search_level = kwargs.get('search_level', 'default')
        self.scoring_metric = kwargs.get('scoring_metric', 'r2')

        # model quality metrics, filled in by train_model()
        self.r2_score_smart_regression = None
        self.mse_smart_regression  = None

        self.event_timestamp_column_name ='evt_timestamp'


    def get_param_grid(self):
        """Return an empty SROM parameter grid."""
        from srom.pipeline.srom_param_grid import SROMParamGrid
        empty_grid = SROMParamGrid(gridtype='empty')
        return empty_grid


    def process_prediction_result(self,df, df_prediction, model):
        """Derive the predicted failure date from the predicted time to failure.

        The prediction values themselves have already been set by `predict()`, so the base class
        implementation is deliberately not called. When a fourth prediction name is configured, it
        receives the event timestamp (index level `self.event_timestamp_column_name`) plus the
        predicted time to failure, interpreted in the unit of the aggregation window (hours,
        minutes, otherwise days).

        Parameters
        ----------
        df : `pandas.DataFrame`
            Not used here.
        df_prediction : `pandas.DataFrame`
            The prediction result, modified in place.
        model
            Not used here.

        Returns
        -------
        `pandas.DataFrame`
            The given `df_prediction`.
        """
        self.logger.debug('Prediction result: self.predictions=%s', self.predictions)
        if len(self.predictions) <= 3:
            return df_prediction

        date_column = self.predictions[3]

        self.logger.debug('Prediction result df_prediction=%s', log_df_info(df_prediction, head=5, logger=self.logger, log_level=logging.DEBUG))
        df_prediction[date_column] = df_prediction.index.get_level_values(self.event_timestamp_column_name)
        df_prediction[date_column] = pd.to_datetime(df_prediction[date_column], format='%Y-%m-%d')

        # The aggregation window may not be a valid offset alias (the constructor defaults it to
        # the integer 1, and it may be None); such values fall back to days, like any other
        # unsupported unit, instead of failing the whole prediction.
        try:
            window_offset = to_offset(self.agg_time)
        except (ValueError, TypeError):
            window_offset = None
        window_unit = type(window_offset.base) if window_offset is not None else None
        unit_name = {Hour: 'h', Minute: 'm'}.get(window_unit, 'D')
        self.logger.debug('process_prediction_result unit_name=%s',unit_name)

        df_prediction[date_column] = df_prediction[date_column] + pd.to_timedelta(df_prediction[self.predictions[0]], unit=unit_name)
        self.logger.debug('Prediction result after modifications df_prediction=%s', log_df_info(df_prediction, head=5, logger=self.logger, log_level=logging.DEBUG))

        return df_prediction

    def predict(self, model, df):
        """Score `df` and store the rounded predictions in it.

        All columns of `df` are used for scoring. `model.predict()` supplies the predicted time to
        failure and the first two columns of `model.predict_proba()` supply the low and high end
        of the prediction interval. The three results are rounded and written into `df` (in place)
        under the configured output names, and `df` is returned.
        """

        def _rounded(values):
            # round element-wise and hand back a plain list for column assignment
            return list(np.around(np.array(values)))

        self.logger.debug('Starting prediction with DataFrame: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # the column names are only needed for selection, the model is fed the raw values
        scoring_columns = list(df.columns)
        self.logger.debug('Prediction columns for scoring: %s', scoring_columns)
        X = df[scoring_columns].values

        self.logger.debug('before model.predict')
        raw_time_to_failure = list(model.predict(X))
        self.logger.debug('after model.predict')
        time_to_failure = _rounded(raw_time_to_failure)

        self.logger.debug('before model.predict_proba')
        interval = model.predict_proba(X)
        self.logger.debug('after model.predict_proba')
        raw_low = list(interval[:,0])
        raw_high = list(interval[:,1])
        interval_low = _rounded(raw_low)
        interval_high = _rounded(raw_high)

        df[self.prediction_interval_low] = interval_low
        df[self.prediction_interval_high] = interval_high
        df[self.predicted_time_to_failure] = time_to_failure

        self.logger.debug('Outputting prediction in DataFrame: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return df

    def get_prediction_result_value_index(self):
        return [0]


    def get_df_for_training(self, df):
        #features = [] + self.features old superlfuous code

        #if self.features_for_training is not None:
        #    features.extend(self.features_for_training)

        #TO DO add logic to check multiclass_indicator # old superfluous code

        #df = df[features] old superfluous code

        #df = df.reset_index(drop=True)
        #return df

        return df.reset_index(drop=True)

    def get_df_for_prediction(self, df):
        """Prepare the data to be scored.

        When aggregation methods are configured, the feature columns are aggregated by
        `TSAggOps.aggregate_scoring_data()` per device (`id`) and timestamp over the configured
        aggregation window. Otherwise the data is used as is, only without the failure history
        columns (failure date and failure codes), which are training targets and not features.

        Parameters
        ----------
        df : `pandas.DataFrame`
            The incoming data. It must contain all of `self.features`.

        Returns
        -------
        `pandas.DataFrame`
            The data to hand to `predict()`.
        """
        self.logger.debug('Getting DataFrame for prediction. Input DataFrame: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        timestamp_column_name = self.event_timestamp_column_name
        device_name_column_name ='id'

        # selecting up front also makes a missing feature column fail fast with a KeyError
        df_for_prediction = df[self.features]

        ts_agg_ops = TSAggOps(loglevel = self.logger.level)

        if self.agg_fns is None:
            # Which failure code column is present depends on the depth of the configured failure
            # mode (class, problem, cause or remedy code), so all of them are dropped when present
            # rather than assuming the problem code.
            failure_history_columns = ['faildate', 'classcode', 'problemcode', 'causecode', 'remedycode']
            df_for_prediction = df.drop(columns=failure_history_columns, errors='ignore')
        else:
            df_for_prediction = ts_agg_ops.aggregate_scoring_data(scoring_df = df_for_prediction,
                            device_id_column_name = device_name_column_name,
                            timestamp_column_name = timestamp_column_name, agg_time = self.agg_time,
                            agg_fns = self.agg_fns, columnwise_aggfns = None, cols_for_aggregation = None)

        self.logger.debug('Prediction DataFrame after feature engineering: %s', log_df_info(df_for_prediction, head=10, logger=self.logger, log_level=logging.DEBUG))

        # the NA diagnostics scan the whole DataFrame, so only compute them when they get logged
        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug('Number of NA records: %s', log_df_info(df=df_for_prediction.isna().sum(), head=10, logger=self.logger, log_level=logging.DEBUG))
            self.logger.debug('NA Values: %s', log_df_info(df_for_prediction[df_for_prediction.isna().any(axis=1)], head=10, logger=self.logger, log_level=logging.DEBUG))

        return df_for_prediction


    def train_model(self, df):
        """Train a SROM SmartRegression model predicting column `days_to_fail`.

        Every other column of `df` is used as a feature, in the column order of `df`. A quarter of
        the rows is held out (fixed random state) to compute the R2 score and the mean squared
        error of the trained model, which are stored in `self.r2_score_smart_regression` and
        `self.mse_smart_regression`.

        Parameters
        ----------
        df : `pandas.DataFrame`
            The training data, containing the feature columns and `days_to_fail`.

        Returns
        -------
        The fitted `SmartRegression` object.
        """
        self.logger.info('Starting model training with input DataFrame: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        from sklearn.metrics import mean_squared_error, r2_score
        from sklearn.model_selection import train_test_split
        from srom.regression.smart_regression import SmartRegression

        column_days_to_fail = 'days_to_fail'

        # Keep the column order of df (dropping duplicate names): the model is fitted on plain
        # arrays, so an arbitrary set ordering here could differ from the column order used when
        # scoring and silently pair values with the wrong features.
        X_col = list(dict.fromkeys(column for column in df.columns if column != column_days_to_fail))
        y_col = [column_days_to_fail]

        # both are plain lists of names, so they are logged directly
        self.logger.debug('Training features_column=%s', X_col)
        self.logger.debug('X_col df: %s', X_col)
        self.logger.debug('y_col df: %s', y_col)

        X = df[X_col].values
        y = df[y_col].values

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=99)

        # execution_platform='single_node_random_search',, otherwise AttributeError: 'NoneType' object has no attribute 'fork_exec'
        self.logger.debug('Training search_level=: %s', self.search_level)
        self.logger.debug('Training execution_type=: %s', self.execution_type)

        self.logger.debug('Training override_training_stages=: %s', self.override_training_stages)
        ar = SmartRegression(
            level= self.search_level,
            scoring=self.scoring_metric,
            cv = self.cv,
            total_execution_time= self.total_execution_time,
            stages = self.override_training_stages,
            execution_platform = self.execution_type,
            aggr_type_for_pred_interval = self.aggregate_type_for_prediction_interval,
            save_prefix='smart_regression_output_',
            n_estimators_for_pred_interval = self.number_of_estimators_for_pred_interval,
            n_leaders_for_ensemble = self.number_of_leaders_for_ensemble
        )

        ar.fit(X_train,y_train)

        y_predict=ar.predict(X_test)

        # sklearn metrics take (y_true, y_pred); R2 is not symmetric, so the order matters
        self.r2_score_smart_regression=r2_score(y_test, y_predict)
        self.logger.debug('R2 Score- SmartRegression: %s', self.r2_score_smart_regression)

        self.mse_smart_regression=mean_squared_error(y_test, y_predict)
        self.logger.debug('MSE- Smart Regression: %s', self.mse_smart_regression)

        return ar

    def get_model_extra(self, new_model, model_path):
        extras = []

        model_json_path = model_path + '_model_metric_json'

        new_model = {'r2_score': self.r2_score_smart_regression, 'mse': self.mse_smart_regression}
        extras.append((model_json_path, json.dumps(new_model), False, False)) # no pickle dump, not binary

        self.logger.debug('extras=%s', extras)

        return extras

    def get_prediction_backtrack(self, model_config, **kwargs):
        """Extend the backtrack window of the parent implementation by one aggregation window.

        The extra window (`aggregate_window_size`, `1D` when not configured) is needed because
        the initial window of data would otherwise be cut off.

        See `pmlib.pipeline.AssetGroupPipeline.get_prediction_backtrack`.
        """

        window_start, window_end = super().get_prediction_backtrack(model_config, **kwargs)
        extra_window = to_offset(model_config.get('aggregate_window_size', '1D'))
        window_start.append(extra_window)
        return [window_start, window_end]


class TimeToFailureEstimatorFeatureExtraction(_BaseTransformer):
    """This transformer transforms the given asset installation date and asset failure date into failure label
    and duration features.

    The extraction is performed by the following logic:

    1. For each asset, extract failure label(s) and duration from its installation date, failure dates, and current time.
    2. Cut the life span of an asset, from installation to current time, into multiple segments, starting from
    the installation to the first failure date, then from first failure date to the second, and so on, until
    from the last failure date to current time.
        * If there's no installation date available, then start from the first failure date.
        * If there's no failure date available, then use current time directly.
        * If neither installation date nor failure date is available, ignore the asset.
    3. Calculate the duration for each segment, in days. Label it as failure if the segment's ending is a failure date
    (if ending is current time then it is labeled as non-failure).
    """

    def __init__(self, installdate_column, faildate_column, event_column, duration_column):
        """
        Parameters
        ----------
        installdate_column : `str`
            The name of asset installation date feature.
        faildate_column : `str`
            The name of asset failure date feature.
        event_column : `str`
            The name of the extracted failure event feature.
        duration_column : `str`
            The name of the extracted duration feature.
        """

        super().__init__()

        self.installdate_column = installdate_column
        self.faildate_column = faildate_column
        self.event_column = event_column
        self.duration_column = duration_column

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Add the failure event label and the segment duration (in days) to the given DataFrame.

        Parameters
        ----------
        df : `pandas.DataFrame`
            The input data. Its index must have at least two levels: the first one is used as the asset key and
            the second one as the segment end time of rows that have no failure date. It must contain the
            installation date and failure date columns given at construction.
        start_ts, end_ts, entities
            Not used by this transformer.

        Returns
        -------
        `pandas.DataFrame`
            For each asset, its first row, its last row and every row having a failure date, minus the rows whose
            duration could not be computed, with the event and duration columns added and the original index
            levels set back.

        Raises
        ------
        RuntimeError
            If none of the rows has a failure date.
        """
        self.logger.debug('Running time to failure feature extraction on input DataFrame: df=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        df_indices = df.index.names
        asset_column = df_indices[0]
        df = df.reset_index()

        # add failure event column: 1 when the row has a failure date, 0 otherwise
        df[self.event_column] = np.where(pd.isnull(df[self.faildate_column]), 0, 1)

        number_of_failure_with_null_value = df[self.faildate_column].isnull().sum()
        number_of_total_record = df.shape[0]

        self.logger.debug('Number of failures with null values: %s', number_of_failure_with_null_value)
        self.logger.debug('Total number of records: %s', number_of_total_record)

        if number_of_failure_with_null_value == number_of_total_record:
            no_failure_message = 'Number of failure record is zero. Please set the failure history of work order in Maximo Manage. The failurecode must have value.'
            self.logger.warning(no_failure_message)
            raise RuntimeError(no_failure_message)

        # now calculate the duration based on installdate, faildate, and current time
        # 1. if faildate is NA, it must be the one for current time (since last failure or installdate), set its end time to current time
        # 2. for faildate not NA, set the previous faildate as the start time (by shift())
        # 3. for the very first row with faildate, the start time would be NA, set this row's start time to be the asset's installdate
        # 4. calculate the diff of start and end, take days
        # 5. this must be per asset, not mixing time from different assets

        duration_start_column = 'calc_date_start'
        duration_end_column = 'calc_date_end'

        # for each asset, keep the very first and very last rows, along with rows having faildate;
        # a row is the first (last) of its asset when no earlier (later) row carries the same asset key,
        # and rows with a null asset key never qualify as first/last
        asset_ids = df[asset_column]
        has_asset = asset_ids.notna()
        is_first = has_asset & ~asset_ids.duplicated(keep='first')
        is_last = has_asset & ~asset_ids.duplicated(keep='last')
        # copy so that the column assignments below operate on an independent frame
        df = df.loc[is_first | is_last | df[self.faildate_column].notna()].copy()

        # segment end: the failure date when present, otherwise the row's own timestamp (second index level)
        df[duration_end_column] = np.where(pd.notna(df[self.faildate_column]), df[self.faildate_column], df[df_indices[1]])
        df = df.sort_values([asset_column, duration_end_column])
        # segment start: the previous segment end of the same asset, or the installation date for the first segment
        df[duration_start_column] = df.groupby([asset_column])[duration_end_column].shift(1)
        df[duration_start_column] = np.where(pd.notna(df[duration_start_column]), df[duration_start_column], df[self.installdate_column])
        df = df.astype({duration_start_column: 'datetime64[ms]'})
        df[self.duration_column] = (df[duration_end_column] - df[duration_start_column]).dt.days

        # assets without installdate would have the very first row having NA duration, drop those rows
        df = df.dropna(subset=[self.duration_column])

        self.logger.debug('Calculated asset runtime duration: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # clean up columns and set index back
        df = df.drop(labels=[duration_start_column, duration_end_column], axis=1, errors='ignore')
        df = df.set_index(df_indices)

        self.logger.debug('Final duration DataFrame: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return df


class TimeToFailureEstimatorFeatureExtractionFailureMode(_BaseTransformer):
    """This transformer builds time-to-failure records for one failure mode.

    The extraction is performed by the following logic:

    1. Select the failure records: rows having a `faildate` whose failure code column
    (`pipeline_config['targets'][1]`) equals `pipeline_config['failure_mode']`.
    2. Decide one window per asset (assets are keyed by the `id` column): the configured
    `aggregate_window_size` when there is one, otherwise the value returned by `TSAggOps.calculate_mtbf`
    for that asset.
    3. Create the time-lagged records of all assets with `TSAggOps.create_time_lagged_records_multiple_devices`,
    using `days_to_fail` as the target column.
    4. If `aggregation_methods` is configured, aggregate the lagged records with
    `TSAggOps.aggregate_lagged_records_multiple_devices`; otherwise use the lagged records as they are, without
    the `faildate` and `problemcode` columns.
    """

    def __init__(self, installdate_column, faildate_column, event_column, duration_column,pipeline_config, time_unit):
        """
        Parameters
        ----------
        installdate_column : `str`
            The name of asset installation date feature.
        faildate_column : `str`
            The name of asset failure date feature.
        event_column : `str`
            The name of the extracted failure event feature.
        duration_column : `str`
            The name of the extracted duration feature.
        pipeline_config : `dict`
            The pipeline configuration. `execute` reads the keys `targets`, `failure_mode` and `features`, and the
            optional keys `aggregation_methods` and `aggregate_window_size`.
        time_unit
            Passed to `TSAggOps` as its `timeunits` argument.
        """

        super().__init__()

        # NOTE(review): the four column names are stored but execute() in this class does not read them;
        # it uses the literal 'faildate' column instead - confirm whether that is intended
        self.installdate_column = installdate_column
        self.faildate_column = faildate_column
        self.event_column = event_column
        self.duration_column = duration_column
        self.pipeline_config = pipeline_config
        self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode time_unit=%s', time_unit)
        self.ts_agg_helper = TSAggOps(timeunits = time_unit,loglevel = self.logger.level)
        self.event_timestamp_column_name ='evt_timestamp'

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Create the (optionally aggregated) time-lagged records for the configured failure mode.

        Parameters
        ----------
        df : `pandas.DataFrame`
            The input data. After resetting its index it must contain the columns `id`, `faildate`, the event
            timestamp column (`evt_timestamp`) and the failure code column named by `pipeline_config['targets'][1]`.
        start_ts, end_ts, entities
            Not used by this transformer.

        Returns
        -------
        `pandas.DataFrame`
            The lagged (or aggregated lagged) records with the original index levels of `df` set back.

        Raises
        ------
        RuntimeError
            If no row has both a failure date and the configured failure mode.
        ValueError
            If the MTBF time unit cannot be derived from the aggregation window offset.
        """
        self.logger.debug('Begin TimeToFailureEstimatorFeatureExtractionFailureMode execute df_input: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode pipeline_config: %s', self.pipeline_config)

        df_indices = df.index.names
        df = df.reset_index()

        failure_code = self.pipeline_config['targets'][1]
        failure_mode = self.pipeline_config['failure_mode']
        self.logger.debug('failure_code=%s failure_mode=%s', failure_code,failure_mode)

        timestamp_column = self.event_timestamp_column_name
        device_name_column_name = 'id'
        target_column_name = 'days_to_fail'

        self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode df_input: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        features_for_training = self.pipeline_config['features']

        # assume we have access to the failure record: keep rows with a failure date and the requested failure mode
        df_fault_data = df[df['faildate'].notnull()]
        df_fault_data = df_fault_data[df_fault_data[failure_code] == failure_mode]

        self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode df_faults_data: %s', log_df_info(df_fault_data, head=5, logger=self.logger, log_level=logging.DEBUG))

        if df_fault_data.shape[0] == 0:
            no_failure_message = 'Number of failure record is zero. Please set the failure history of work order in Maximo Manage. The failurecode must have value.'
            self.logger.error(no_failure_message)
            raise RuntimeError(no_failure_message)

        agg_fns = self.pipeline_config.get('aggregation_methods',None)
        self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode aggregation_methods=%s', agg_fns)

        agg_time = self.pipeline_config.get('aggregate_window_size',None)
        self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode aggregate_window_size=%s', agg_time)

        # one window per asset in the dataset
        device_name_mtbf_map = {}
        df_sensor_data = df
        df_sensor_data_grouped = df_sensor_data.groupby(device_name_column_name)

        # If agg_time is None, use mtbf, else use the input from pipeline config.
        if agg_time is None:
            # No window is configured on this path (key missing or set to None), so the MTBF time unit is
            # derived from the default '1D' window. It is the same for every asset, hence resolved once.
            window = self.pipeline_config.get('aggregate_window_size') or '1D'
            unit_name = {Hour: 'h', Day: 'D', Minute: 'm'}.get(type(to_offset(window).base))
            if unit_name is None:
                raise ValueError('invalid model_pipeline.aggregate_window_size=%s, unsupported offset alias, can only be one of min, h or D (case sensitive)' % window)
            self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode - calculating MTBF using the unit_name %s', unit_name)

            for group_name, df_group_by_device in df_sensor_data_grouped:
                self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode - calculating MTBF for the asset %s', group_name)
                self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode %s has the shape %s', group_name, df_group_by_device.shape)

                device_name_mtbf_map[group_name] = self.ts_agg_helper.calculate_mtbf(
                    df_sensor_data[df_sensor_data[device_name_column_name] == group_name],
                    df_fault_data[df_fault_data[device_name_column_name] == group_name],
                    timestamp_column,
                    0.5,
                    time_unit=unit_name
                )
        else:
            for group_name, df_group_by_device in df_sensor_data_grouped:
                self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode - applying the aggregation window %s for the asset %s', agg_time, group_name)
                self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode %s has the shape %s', group_name, df_group_by_device.shape)
                device_name_mtbf_map[group_name] = agg_time

        self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode Agg times for the assets in the data frame = %s', device_name_mtbf_map)

        # the fault records were sliced off before this conversion, so only the sensor frame is converted here
        df_sensor_data[timestamp_column] = pd.to_datetime(df_sensor_data[timestamp_column])

        # both the sensor data and the fault data identify the asset by the 'id' column
        df_time_lagged_data = self.ts_agg_helper.create_time_lagged_records_multiple_devices(
            df_sensor_data,
            df_fault_data,
            device_name_column_name,
            'id',
            timestamp_column,
            'faildate',
            target_column_name
        )

        self.logger.debug('TimeToFailureEstimatorFeatureExtractionFailureMode df_time_lagged_data: %s', log_df_info(df_time_lagged_data, head=5, logger=self.logger, log_level=logging.DEBUG))

        if agg_fns is None:
            df = df_time_lagged_data.drop(columns=['faildate', 'problemcode'])
            self.logger.debug('No aggregation_methods is configured.')
        else:
            df = self.ts_agg_helper.aggregate_lagged_records_multiple_devices(
                df_time_lagged_data,
                df_fault_data,
                device_name_mtbf_map,
                lagged_time_col_name = target_column_name,
                sensor_asset_id_column_name = 'id',
                failure_asset_id_column_name = 'id',
                sensor_timestamp_column_name = timestamp_column,
                failure_timestamp_column_name = 'faildate',
                columnwise_aggfns = None,
                agg_fns = agg_fns,
                cols_for_aggregation = features_for_training
            )
            self.logger.debug('df_time_lagged_aggregated_data: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # set the original index levels back
        df = df.set_index(df_indices)
        self.logger.debug('end of TimeToFailureEstimatorFeatureExtractionFailureMode execute df_final: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return df
