# Licensed Materials - Property of IBM
# 5737-M66, 5900-AAA, 5900-AMG
# (C) Copyright IBM Corp. 2019, 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication, or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.

import numpy as np
import pandas as pd
import pytest

import pmlib
import datetime as dt
import json
import logging
import os
import sys
import time
import types

import ibm_boto3
#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import seaborn as sns

#from botocore.client import Config
from sklearn import metrics




from pmlib.failure_prediction import (FailurePredictionAssetGroupPipeline, FailurePredictionEstimator, FailurePredictionRcaEstimator,
                                      FailurePredictionEstimatorFeatureExtraction, MulticlassFailurePredictionEstimator, MulticlassFailurePredictionRcaEstimator)

class myFailurePredictionAssetGroupPipeline(FailurePredictionAssetGroupPipeline):
    """Failure prediction pipeline that registers a prediction estimator and an RCA estimator."""

    def prepare_execute(self, pipeline, model_config):
        """Add the failure prediction estimator and the RCA estimator to `pipeline`.

        Two estimators are added as separate sequential stages, in this order:

        * `pmlib.failure_prediction.FailurePredictionEstimator`
          (or `MulticlassFailurePredictionEstimator` when `model_config['multiclass']` is set)
        * `pmlib.failure_prediction.FailurePredictionRcaEstimator`
          (or `MulticlassFailurePredictionRcaEstimator` when `model_config['multiclass']` is set)

        Each estimator gets its own
        `pmlib.failure_prediction.FailurePredictionEstimatorFeatureExtraction` instance as a
        training-only preprocessor, built from the first entry of
        `model_config['features_for_training']`.

        The `SimpleSummaryStatistics` first stage is currently disabled (see the commented-out
        lines below), so no extra summary-statistic features are generated here.

        Parameters:
            pipeline: The pipeline object the stages are added to (via `add_stage`).
            model_config (dict): Model configuration. Reads `predictions`, `multiclass`,
                `failure_modes`, `features_for_training` and `prediction_window_size`, and is
                passed as keyword arguments to both estimators. `failure_modes` is set to `{}`
                when missing or `None`. `predictions` is temporarily narrowed for each estimator
                and always restored to the original list before returning, even on error.

        See `pmlib.pipeline.AssetGroupPipeline.prepare_execute`.
        """

        # note that the 'features' array would be copied first before being passed to the simple summary
        # statistics transformer, because it is extended afterwards for downstream stages with the
        # additional features generated by it. Without the copy, the same (extended) array would be
        # used as input by the simple summary statistics transformer.
        #pipeline.add_stage(SimpleSummaryStatistics(features=model_config['features'].copy(), aggregation_methods=model_config['aggregation_methods'], rolling_window_size=model_config['rolling_window_size']))

        # add the generated statistics into model_config['features']
        #model_config['features'].extend([(feature + '__' + aggr + '__' + str(model_config['rolling_window_size'])) for aggr in model_config['aggregation_methods'] for feature in model_config['features']])

        # because we have 2 models each using one prediction output name, we need to tweak the prediction
        # parameter accordingly before creating each estimator. first preserve the original list
        predictions = model_config['predictions']

        # get the multiclass flag (kept as an equality test so that only True/1 enable it)
        multiclass = model_config.get('multiclass', False) == True

        # NOTE: model_config['smartclassification'] is not acted on here; it is only forwarded to the
        # estimators as part of **model_config when present.

        # make sure the failure modes are always a dict
        if model_config.get('failure_modes', None) is None:
            model_config['failure_modes'] = {}

        def _new_label_extractor():
            # Each estimator needs its own preprocessor instance to extract asset failure labels.
            return FailurePredictionEstimatorFeatureExtraction(
                feature=model_config['features_for_training'][0],
                prediction_window_size=model_config['prediction_window_size'],
                multiclass=multiclass,
                failure_modes=model_config['failure_modes'])

        try:
            # add the 1st estimator for failure prediction
            model_config['predictions'] = predictions[0:-1] if multiclass else predictions[0:1]
            if multiclass:
                estimator = MulticlassFailurePredictionEstimator(**model_config)
            else:
                estimator = FailurePredictionEstimator(**model_config)

            # NOTE must add estimator to the pipeline first before adding its training preprocessors
            pipeline.add_stage(estimator)
            estimator.add_training_preprocessor(_new_label_extractor())

            # add the 2nd RCA estimator
            model_config['predictions'] = predictions[-1:] if multiclass else predictions[1:2]
            if multiclass:
                estimator2 = MulticlassFailurePredictionRcaEstimator(**model_config)
            else:
                estimator2 = FailurePredictionRcaEstimator(**model_config)

            # NOTE must add estimator to the pipeline first before adding its training preprocessors
            pipeline.add_stage(estimator2)
            estimator2.add_training_preprocessor(_new_label_extractor())
        finally:
            # restore the original prediction output list, even when building a stage failed
            model_config['predictions'] = predictions

def plot_predictions_by_asset(df_predictions, df_failures, start_ts, end_ts):
    """Plot predicted failure probability over time, one subplot per asset.

    A fixed 5x4 grid of subplots is created; because assets are paired with axes
    through `zip`, at most 20 assets are drawn. For each asset the
    `custom_failure_probability_30d` column is plotted as a line, and a red
    curve is overlaid that is 0 everywhere except for a spike to 1 at each
    failure timestamp lying strictly inside the asset's prediction time range
    (dropping back to 0 24 hours before and after).

    Parameters:
        df_predictions: DataFrame with an `id` column and a
            `custom_failure_probability_30d` column; its index is used as the time axis.
        df_failures: DataFrame with `asset_id` and `timestamp_utc` columns.
        start_ts: Lower x-axis limit, anything accepted by `pd.Timestamp`.
        end_ts: Upper x-axis limit, anything accepted by `pd.Timestamp`.

    NOTE(review): `plt` and `sns` are used here, but the matplotlib/seaborn imports
    at the top of this file are commented out - confirm they are provided elsewhere
    before calling this function.
    """
    fig, axes_grid = plt.subplots(5, 4, figsize=(25, 20))
    axs = [axis for axes_row in axes_grid for axis in axes_row]
    one_day = dt.timedelta(hours=24)

    for ax, asset_id in zip(axs, df_predictions.id.unique()):
        # Failures are keyed by the first 11 characters of the prediction id.
        asset = asset_id[:11]

        # Plot predictions
        asset_predictions = df_predictions[df_predictions.id == asset_id]
        asset_predictions.custom_failure_probability_30d.plot(kind='line', linewidth=1, linestyle='-', marker='.', markersize=5, alpha=0.4, ax=ax)
        ax.set_ylim([-0.05, 1.05])
        ax.set(xlabel="time", ylabel="probability", title=asset)

        # Plot failures
        asset_failures = df_failures[df_failures.asset_id == asset]
        first_ts = asset_predictions.index[0]
        last_ts = asset_predictions.index[-1]
        step_points = [(first_ts, 0)]
        if asset_failures.shape[0] > 0:
            for failure_ts in asset_failures.timestamp_utc:
                # Only failures strictly inside the plotted range get a spike.
                if not ((failure_ts > first_ts) and (failure_ts < last_ts)):
                    continue
                step_points.extend([
                    (failure_ts - one_day, 0),
                    (failure_ts, 1),
                    (failure_ts + one_day, 0),
                ])
        step_points.append((last_ts, 0))
        failure_curve = pd.DataFrame(step_points).set_index(0)[1]
        sns.lineplot(data=failure_curve, ax=ax, color='r', alpha=0.6)

        ax.set_xlim([pd.Timestamp(start_ts), pd.Timestamp(end_ts)])

    # Draw first so tick labels exist, then rotate them and hide the legends
    fig.canvas.draw()
    for ax in axs:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=25, ha="right")
        ax.legend().set_visible(False)
    plt.subplots_adjust(hspace=0.7)

    
def label_failures(df_predictions, df_failures, prediction_threshold=0.5):
    """Label each prediction row with predicted and actual 30-day failure flags.

    Adds the following columns to a copy of the predictions:
        failure_prediction_30d    binary label of whether failure is predicted to occur in the next 30 days
        next_failure              timestamp of the asset's next recorded failure (NaT when there is none)
        days_to_failure           days from the prediction to `next_failure` (NaN when there is none)
        failure_30d               binary label of whether failure occurs within 30 days

    Parameters:
        df_predictions: DataFrame whose `reset_index()` exposes `id` and `event_timestamp`
            columns, with a `custom_failure_probability_30d` column. Ids are truncated to
            their first 11 characters to match `df_failures.asset_id`.
        df_failures: DataFrame with `asset_id` and `timestamp_utc` columns.
        prediction_threshold: Probability strictly above which a failure is predicted.

    Returns:
        DataFrame with one row per prediction (failure-only rows from the merge are dropped),
        ordered by asset id and timestamp, including the merge helper columns
        (`asset_id`, `timestamp_utc`, `_merge`).
    """
    df = df_predictions.copy().reset_index()
    df['id'] = df['id'].str[:11]
    df['failure_prediction_30d'] = (df['custom_failure_probability_30d'] > prediction_threshold).astype(int)

    # The sorted outer merge interleaves failure events with predictions in time order per asset,
    # so a backward fill of the failure timestamp yields "the next failure" for each prediction.
    df = pd.merge(
        df,
        df_failures[['asset_id', 'timestamp_utc']],
        how='outer',
        left_on=['id', 'event_timestamp'],
        right_on=['asset_id', 'timestamp_utc'],
        indicator=True,
        sort=True,
    )
    # Failure-only rows have no prediction id; borrow it from the failure record.
    df['id'] = df['id'].fillna(df['asset_id'])

    # Back-fill within each asset only, so one asset's failures never leak into another's rows.
    # Assets without any failure simply keep NaT / NaN and end up with failure_30d == 0.
    df['next_failure'] = df.groupby('id', sort=False)['timestamp_utc'].bfill()
    time_to_failure = pd.to_timedelta(df['next_failure'] - df['event_timestamp'])
    df['days_to_failure'] = time_to_failure.dt.total_seconds() / (60 * 60 * 24)
    # NaN compares as False, so rows without an upcoming failure are labelled 0.
    df['failure_30d'] = (df['days_to_failure'] < 30).astype(int)

    # Keep only real prediction rows (drops the failure-only rows added by the outer merge).
    df = df.dropna(axis=0, how='any', subset=['event_timestamp', 'custom_failure_probability_30d'])
    df['failure_30d'] = df['failure_30d'].astype(int)
    return df


def quantitative_evaluation(y_true, y_pred, y_pred_prob):
    """Print model evaluation statistics and graph the precision-recall curve.

    Shows the 2x2 confusion matrix, then prints precision, recall, ROC AUC,
    average precision and F1 score, and finally plots the precision-recall
    curve together with a dashed baseline at the positive-class rate.

    Parameters:
        y_true: Actual binary labels (0/1); must support `.sum()` and `.shape`.
        y_pred: Predicted binary labels (0/1).
        y_pred_prob: Predicted probability of the positive class.

    NOTE(review): `plt` is used for the plot, but the matplotlib import at the top
    of this file is commented out - confirm it is provided elsewhere.
    """
    # Fixing the label order guarantees a 2x2 matrix even when only one class is present,
    # and keeps the counts in the correct cell whichever class that is.
    cm = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1])
    df_cm = pd.DataFrame(cm, index=['Actual no', 'Actual yes'], columns=['Predicted no', 'Predicted yes'])

    # `display` only exists in IPython/Jupyter sessions; fall back to print elsewhere.
    try:
        show = display
    except NameError:
        show = print
    show(df_cm)

    print('Precision:          {:,.1f}%'.format(metrics.precision_score(y_true, y_pred) * 100))
    print('Recall:             {:,.1f}%'.format(metrics.recall_score(y_true, y_pred) * 100))
    print('ROC AUC:            {:.2f}  '.format(metrics.roc_auc_score(y_true, y_pred_prob)))
    print('Average Precision:  {:.2f}  '.format(metrics.average_precision_score(y_true, y_pred_prob)))
    print('F1 score:           {:.2f}  '.format(metrics.f1_score(y_true, y_pred)))

    # Plot Precision-Recall Curve
    fig, ax = plt.subplots(figsize=(7, 5))
    # Baseline: precision of a classifier that flags everything, i.e. the positive rate.
    baseline_precision = y_true.sum() / y_true.shape[0]
    ax.plot([0, 1], [baseline_precision, baseline_precision], 'k', linestyle='--', label='Baseline (AP: {:.2f})'.format(baseline_precision))
    # precision_recall_curve returns (precision, recall, thresholds); recall goes on the x-axis.
    pr_curve = metrics.precision_recall_curve(y_true, y_pred_prob)
    ax.plot(pr_curve[1], pr_curve[0])
    ax.set_xlabel('Recall (True Positive Rate)', fontsize=12)
    ax.set_ylabel('Precision', fontsize=12)
    ax.set_title('Precision-Recall Curve', fontsize=14)
    #ax.legend(loc='upper right');
    ax.set_xlim([-.05, 1.05])
    ax.set_ylim([-.05, 1.05])
    ax.tick_params(labelsize=10)

    
def eval_predictions(df_predictions, df_failures, start_ts, end_ts):
    """Label the predictions against recorded failures and print the evaluation report.

    `start_ts` and `end_ts` are only needed by the per-asset plot, which is
    currently disabled, so they are not used here.
    """
    #plot_predictions_by_asset(df_predictions.reset_index('id'), df_failures, start_ts, end_ts)
    labelled = label_failures(df_predictions, df_failures)
    actual = labelled.failure_30d
    predicted = labelled.failure_prediction_30d
    probability = labelled.custom_failure_probability_30d
    quantitative_evaluation(actual, predicted, probability)

def test(asset_group_id, iot_type, deviceid_prefix, assetid_prefix, siteid='BEDFORD'):
    """Train a failure prediction pipeline for one asset group on a fixed date range.

    Builds a `FailurePredictionAssetGroupPipeline` over the daily pump sensor
    features, overriding the training stages with a single random forest
    classifier stage, and executes it for 2017-07-01 .. 2019-07-01.

    Parameters:
        asset_group_id: Id of the asset group to build the pipeline for.
        iot_type: Unused; kept for interface compatibility.
        deviceid_prefix: Unused; kept for interface compatibility.
        assetid_prefix: Unused; kept for interface compatibility.
        siteid: Unused; kept for interface compatibility.
    """
    from sklearn.ensemble import RandomForestClassifier

    from pmlib.failure_prediction import FailurePredictionAssetGroupPipeline

    pump_sensor = 'PUMP_SENSOR_DAILY'

    # Only the estimator stage is overridden; no transformation, scaling,
    # preprocessing or feature-generation stage sets are configured.
    estimator_set = [
        ('randomforestclassifier', RandomForestClassifier(max_depth=20, max_features='sqrt', min_samples_leaf=2, min_samples_split=10, n_estimators=300, n_jobs=-1))
    ]
    my_stages = [estimator_set]

    for stage in my_stages:
        print(stage)

    feature_names = [
        'current_max_scaled',
        'current_max_scaled_7d_max',
        'current_std_scaled',
        'daily_starts_scaled',
        'rainfall_14d',
        'rainfall_14d_max',
        'rainfall_30d',
        'rainfall_30d_max',
        'runtime_max_scaled_7d_max',
        'storage_level_max_scaled',
        'storage_level_max_scaled_7d_max',
        'storage_level_min_scaled',
        'storage_level_min_scaled_7d_min',
        'storage_level_std_scaled',
    ]

    fp_pipeline = FailurePredictionAssetGroupPipeline(
        asset_group_id=asset_group_id,
        model_pipeline={
            "features": [pump_sensor + ':' + name for name in feature_names],
            "features_for_training": [":faildate"],                            # features used for labelling data
            "predictions": ["custom_failure_probability", "custom_rca_path"],  # model outputs
            "aggregation_methods": None,                                       # feature engineering, options: "mean", "max", "min", "median", "std", "sum", "count"
            "prediction_window_size": "30d",                                   # failure prediction window, e.g.: 4h, 15d, 30d, 60d, 90d
            #"rolling_window_size": "1d",                                      # rolling window for calculating summary statistics
            "override_training_stages": my_stages,
            "mode": "1",
            "scoring": "recall",
        },
    )

    train_start = '2017-07-01'
    train_end = '2019-07-01'

    df_train = fp_pipeline.execute(start_ts=train_start, end_ts=train_end)

    

if __name__ == '__main__':
    # Script entry point: run the training test for the sample asset group.
    test(
        asset_group_id='1001',
        iot_type='NodeMCU',
        deviceid_prefix='Brake1205',
        assetid_prefix='BRAKE1205',
        siteid='BEDFORD',
    )

