# Licensed Materials - Property of IBM
# 5737-M66, 5900-AAA
# (C) Copyright IBM Corp. 2019, 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication, or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.

"""Anomaly Detection Model.

IoT sensors are pervasive in the manufacturing industries for monitoring assets. The data these sensors capture can be combined with advanced statistical and machine learning algorithms to yield insight on pending failures and optimal maintenance and operation strategies. A gas turbine, compressor, and boat engine are examples of valuable and costly assets that can benefit from such insights. These industrial assets fail for many reasons, and often they behave in abnormal ways preceding a failure. In order to have an effective predictive maintenance program, it is crucial to be able to detect these abnormalities so that action can be taken before costly downtimes occur due to unexpected failures. However, identifying these telltale abnormalities before a failure event is difficult since, in general, failures are comparatively rare events so there is often a scarcity of tagged data for building predictive models. In the absence of abundant tagged (supervised) failure data, anomaly detection techniques as described in this notebook provide an unsupervised way of learning a model from data that mostly contains “normal” operating data with a sparse number of actual failure events.

This automated anomaly detection pipeline explores various anomaly detection techniques to identify an optimal one. The pipeline is divided into an anomaly trainer and an anomaly evaluator: the anomaly model is first learned on the training data that is assumed to be normal. Normal data are selected from a time window that is far from the time when the failure occurred (i.e., the asset is operating normally). The trained model is then applied to the validation data (mostly data coming from the pre-failure window in the case of time series data) to obtain the anomaly validator. In the anomaly validator, we calculate the performance of each anomaly model with respect to the anomaly label, where samples very near the failures are assigned anomaly label 1 while samples not so near the failures are assigned anomaly label 0.

In summary, a user must provide two datasets for anomaly learning:

1. Normal Data to train the anomaly model.
2. Validation Data to do anomaly model evaluation. Samples are from pre-failure windows close to failures.
"""

import logging

import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import (DateOffset, Day, Hour, Minute, MonthBegin,
                                    MonthEnd, Second, Week)
from srom.anomaly_detection.generalized_anomaly_model import \
    GeneralizedAnomalyModel
from srom.pipeline.anomaly_pipeline import AnomalyPipeline
from srom.pipeline.srom_param_grid import SROMParamGrid
from srom.pipeline.utils.anomaly_utils import (estimator_comparison,
                                               get_threshold_statistics,
                                               retrive_base_learner)
from srom.utils.anomaly_dag import anomaly_dag

from .estimator import SromEstimator
from .pipeline import AssetGroupPipeline
from .transformer import (ContextTransformer, IdentifyPreFailureWindow,
                          TemporalRateChangeFeaturesTransformer,
                          TransformNotNaToEvent, _BaseTransformer)
from .unsupervised_anomaly_detection import (UnsupervisedADLearningAlgorithm,
                                             UnsupervisedAnomalyDetection)
from .util import log_df_info
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer

from srom.utils.no_op import NoOp


class AnomalyDetectionAssetGroupPipeline(AssetGroupPipeline):
    """Anomaly detection model pipeline.

    This `pmlib.pipeline.AssetGroupPipeline` sub-class implements the anomaly detection model.

    Trained anomaly detection model generates an anomaly score for each incoming event, along with
    an anomaly score threshold. You can choose the names of these outputs by key `predictions` of
    the dict parameter `model_pipeline` (`predictions` here is always a length-three array, of which the
    first element is the anomaly score output name, and the second element is the anomaly score
    threshold output name, the third element is if the anomaly is detected or not, and the order matters).

    As for the inputs to the model, you can use `features` of the dict parameter `model_pipeline` to
    specify the attributes from assets and/or IoT devices. As for the `features_for_training` of
    the dict parameter `model_pipeline`, it is required to be a length-one array containing a
    datetime attribute representing the asset failure history.

    Here's a typical example of how to create an object of this class:

    ```
    AnomalyDetectionAssetGroupPipeline(
        asset_group_id='ID of an asset group',
        model_pipeline={
            'features': ['DeviceTypeOne:temperature', 'DeviceTypeTwo:Humidity'],
            'features_for_training': [':faildate'],
            'predictions': ['anomaly_score', 'anomaly_threshold','anomaly_detected'],
            'pre_failure_window_size': 20,
            'pre_failure_failure_size': 10,
            'srom_training_options': {
                'exectype': 'spark_node_random_search'
            }
        })
    ```

    This model has two special `model_pipeline` configurations for tweaking how to form the pre-failure
    windows for the validation data set:

    * `pre_failure_window_size`: int, optional

        The size of pre-failure window, that is, how many events before a failure event should be
        included in pre-failure windows. Default is 20.

    * `pre_failure_failure_size`: int, optional

        Within each pre-failure window, the number of events immediately before the failure event to
        be assigned label 1 (failure), while all the rest events are assigned label 0. Default is 10.

    This model typically requires a lot of processing power and time to train. It is often necessary
    to use key `srom_training_options` of the dict parameter `model_pipeline` to tweak the processing
    power for training. The example above chooses a different `exectype`: `spark_node_random_search`
    to request the use of a Spark environment (usually having multiple nodes) for training. If you don't
    have a Spark environment, try `single_node_random_search` or `single_node_complete_search` for `exectype`.

    This model pipeline also adds a post-processing phase to generate a daily maximum anomaly score
    per asset. The name of this daily anomaly score is simply the specified anomaly score output name
    prefixed with 'daily_'.

    See base class for details on all the parameters available.
    """

    def __init__(self, **kwargs):
        """Create the pipeline, forcing the model template metadata for this exact class.

        When the instance is created directly from `AnomalyDetectionAssetGroupPipeline` (not from
        a sub-class), `model_template_name` is set to 'Anomaly Detection' and `model_template_desc`
        to None before all keyword arguments are handed to the base class constructor.
        """
        is_exact_class = self.__class__ == AnomalyDetectionAssetGroupPipeline
        if is_exact_class:
            kwargs.update(
                model_template_name='Anomaly Detection',
                model_template_desc=None,
            )
        super().__init__(**kwargs)

    def default_summary(self, model_config):
        """This class generates by default a daily maximum anomaly score.

        See `pmlib.pipeline.AssetGroupPipeline.default_summary`.
        """
        unit_name = None
        #unit = type(to_offset(model_config['rolling_window_size'].lower()).base)
        #unit = type(to_offset(model_config['rolling_window_size']).base)

        #default rolling_window_size to Day

        try:
            unit = type(to_offset(model_config['global_sampling_interval']).base)
        except KeyError:
            unit = Day

        self.logger.debug('global_sampling_interval unit: %s', unit)
        if unit == Hour:
            unit_name = 'hourly'
        elif unit == Day:
            unit_name = 'daily'
        elif unit == Minute:
            unit_name = 'minute'
        else:
            #raise ValueError('invalid model_pipeline.global_sampling_interval=%s, unsupported offset alias, can only be one of min,H and D (case insensitive)' % model_config['global_sampling_interval'])
            # default to daily in case global_sampling_interval is not set for the MAS8.10
            unit_name = 'daily'

        predictions = model_config['predictions']
        
        summary = {}
        for idx, value in enumerate(model_config['predictions'][:]): 
            summary['${predictions[%s]}' % str(idx)] = {
                unit_name: {
                    'max': ('${granularity}_${data_item}', '${data_item}'),
                },
                
            }
        self.logger.debug('Compiled Anomaly Detection default_summary: %s', summary)
                
        return summary

    def prepare_model_config(self, model_config):
        """This class overrides this method to set the default value to the following two custom model pipeline
        configuration when not given in the constructor's parameter `model_pipeline`.

        * `pre_failure_window_size`: by default, if not given, set to be 20
        * `pre_failure_failure_size`: by default, if not given, set to be 10

        It also validates whether the result `pre_failure_failure_size` is equal to or smaller than
        `pre_failure_window_size`, and if not raises ValueError.

        See `pmlib.pipeline.AssetGroupPipeline.prepare_model_config`.
        """

        if 'pre_failure_window_size' not in model_config:
            model_config['pre_failure_window_size'] = 20
        if 'pre_failure_failure_size' not in model_config:
            model_config['pre_failure_failure_size'] = 10

        if model_config['pre_failure_failure_size'] > model_config['pre_failure_window_size']:
            raise ValueError('model_pipeline.pre_failure_failure_size cannot be larger than model_pipeline.pre_failure_window_size')

        # if provide features_for_training, use it
        if model_config.get('features_for_training', None) is not None and len(model_config['features_for_training']) > 0:
            pass
        else:
            # if not provide features_for_training, infer it from model_config
            infered_features_for_training = [':faildate']

            # modify features_for_training
            model_config['features_for_training'] = [feature.replace(':', '') for feature in infered_features_for_training]

            # modify targets
            model_config['targets'] = [feature.replace(':', '') for feature in infered_features_for_training]

            # modify inputs
            # if model_config.get('inputs', None) is not None:
            inputs_list = list(model_config['inputs'])
            inputs_list.extend(infered_features_for_training)
            del model_config['inputs']
            model_config['inputs'] = tuple(inputs_list)

            # modify rename_inputs
            # if model_config.get('renamed_inputs', None) is not None:
            rename_input_list = list(model_config['renamed_inputs'])
            rename_input_list.extend([feature.replace(':', '') for feature in infered_features_for_training])
            del model_config['renamed_inputs']
            model_config['renamed_inputs'] = tuple(rename_input_list)

        #contextual anomaly
        if model_config.get('context_name', None) is not None:
            context_name = model_config.get('context_name', None)
            self.logger.debug('context_name: %s', context_name)
            prediction_list = []
            prediction_list.append('%s_%s' % (context_name.lower(),model_config['predictions'][0]))
            prediction_list.append('%s_%s' % (context_name.lower(),model_config['predictions'][1]))
            prediction_list.append('%s_%s' % (context_name.lower(),model_config['predictions'][2]))

            model_config['predictions'] = prediction_list

            self.logger.debug('Contextual anomaly predictions: %s', model_config['predictions'])

    def prepare_execute(self, pipeline, model_config):
        """This class overrides this method to assemble the anomaly detection pipeline stages.

        After the base class preparation, stages are added in this order:

        * `pmlib.transformer.ContextTransformer`, only when `context_expression` is configured.
          When `exclude_column_list` is also configured it is passed to the transformer and its
          columns are removed from `model_config['features']` (original feature order is kept).
        * `pmlib.transformer.TemporalRateChangeFeaturesTransformer`, only when `aggregation_methods`
          is non-empty and `use_labeled_data` is False. The generated feature names
          `<feature>__<aggregation>__<rolling_window_size>` are appended to
          `model_config['features']`; `rolling_window_size` defaults to '1D'.
        * the estimator: `AnomalyDetectionEstimator` when `use_labeled_data` is true (the default),
          otherwise `UnSupervisedAnomalyDetectionEstimator`.

        When labeled data is used and `features_for_training` is not empty, these training
        preprocessors are attached to the estimator, all keyed on the first training feature:

        * `DataQualityAdvisor` (skipped when `data_quality_advisor` is false)
        * `pmlib.transformer.TransformNotNaToEvent`
        * `pmlib.transformer.IdentifyPreFailureWindow`

        See `pmlib.pipeline.AssetGroupPipeline.prepare_execute`.
        """

        super().prepare_execute(pipeline, model_config)

        data_quality_advisor = model_config.get('data_quality_advisor', True)
        use_labeled_data = model_config.get('use_labeled_data', True)

        if model_config.get('context_expression', None) is not None:
            exclude_column_list = model_config.get('exclude_column_list', None)
            if exclude_column_list is not None:
                self.logger.debug('exclude_column_list: %s', exclude_column_list)
            pipeline.add_stage(ContextTransformer(context_expression=model_config['context_expression'], exclude_column_list=exclude_column_list))

            if exclude_column_list is not None:
                self.logger.debug('original model_config features: %s', model_config['features'])
                # Drop the excluded columns while keeping the configured feature order. A plain
                # set difference would yield an order that depends on string hashing and can
                # therefore differ from one interpreter run to the next.
                excluded = set(exclude_column_list)
                model_config['features'] = list(dict.fromkeys(
                    feature for feature in model_config['features'] if feature not in excluded
                ))
                self.logger.debug('new features list: %s', model_config['features'])

        aggregation_methods = model_config.get('aggregation_methods', None)
        # `== False` is kept as an equality test so that a `None` value for use_labeled_data
        # keeps behaving as before (this stage is not added).
        if aggregation_methods is not None and len(aggregation_methods) > 0 and use_labeled_data == False:
            self.logger.debug('will add TemporalRateChangeFeaturesTransform to the pipeline: %s', aggregation_methods)

            # Resolve the window size once (default '1D') so that the transformer and the
            # generated feature names always agree, also when the key is not configured.
            rolling_window_size = model_config.get('rolling_window_size', '1D')

            # note that 'features' array is copied first to be passed to the transformer because
            # later we extend it for downstream stages with the additional generated features. If
            # we don't copy, the same (extended) array would be used as input by the transformer.
            pipeline.add_stage(TemporalRateChangeFeaturesTransformer(features=model_config['features'].copy(), aggregation_methods=aggregation_methods, rolling_window_size=rolling_window_size))

            # add the generated statistics into model_config['features']
            original_feature_list = model_config['features'].copy()
            self.logger.debug('original features: %s', original_feature_list)

            # simple aggregations first, then advanced ones; extending with an empty list is a no-op
            simple_aggregation, advanced_aggregation = self.get_new_feature_list(aggregation_methods)
            for aggregations in (simple_aggregation, advanced_aggregation):
                model_config['features'].extend([
                    (feature + '__' + aggr + '__' + str(rolling_window_size))
                    for aggr in aggregations
                    for feature in original_feature_list
                ])

            self.logger.debug('new features: %s', model_config['features'])

        if use_labeled_data:
            self.logger.debug('Using semi-supervised learning estimator')
            estimator = AnomalyDetectionEstimator(**model_config)
        else:
            self.logger.debug('Using Unsupervised learning estimator')
            estimator = UnSupervisedAnomalyDetectionEstimator(**model_config)

        pipeline.add_stage(estimator)

        if use_labeled_data and len(model_config['features_for_training']) > 0:
            # the first training feature is the failure date attribute used for labeling
            failure_feature = model_config['features_for_training'][0]

            if data_quality_advisor:
                estimator.add_training_preprocessor(DataQualityAdvisor(
                    failure_feature,
                    minimum_acceptable_data_quality_score=model_config.get('minimum_acceptable_data_quality_score', 0)
                ))
            else:
                self.logger.debug("Skipping Data Quality Advisor")

            estimator.add_training_preprocessor(TransformNotNaToEvent(failure_feature))
            estimator.add_training_preprocessor(IdentifyPreFailureWindow(
                failure_feature,
                pre_failure_window_size=model_config['pre_failure_window_size'],
                pre_failure_failure_size=model_config['pre_failure_failure_size'],
            ))


    def get_new_feature_list(self, aggregation_methods ):
        self.logger.debug('begin of get_new_feature_list: %s', aggregation_methods)
        simple_aggregation_methods = ["mean", "max", "min", "median", "std", "sum", "count"]
        advanced_aggregation_methods = [
            "rate_of_change",
            "sum_of_change",
            "absoluate_sum_of_changes",
            "trend_slop",
            "abs_energy",
            "mean_abs_change",
            "mean_change",
            "mean_second_derivate_central",
            "count_above_mean",
            "count_below_mean",
            "last_location_of_maximum",
            "first_location_of_maximum",
            "corr_coefficient",
        ]
        
        self.logger.debug('simple_aggregation_methods: %s', simple_aggregation_methods)
        self.logger.debug('advanced_aggregation_methods: %s', advanced_aggregation_methods)
        simple_aggregation= []
        advanced_aggregation = []
        
        if set(aggregation_methods) <= set(simple_aggregation_methods):
            # all simple
            self.logger.debug('Using simple aggregation methods')
            simple_aggregation=aggregation_methods
        else:
            self.logger.debug('Advanced aggregation methods found')
            # all advanced
            if set (aggregation_methods) <= set (advanced_aggregation_methods):
                self.logger.debug('Using all adavanced aggregation methods')
                advanced_aggregation=aggregation_methods
            else:
                self.logger.debug('Using mixed aggregation methods')
            
                for i in aggregation_methods:
                    if i in simple_aggregation_methods:
                        simple_aggregation.append(i)
                    elif i in advanced_aggregation_methods:
                        advanced_aggregation.append(i)
                    else:
                        raise RuntimeError(' function in the aggregation_methods is not defined ',i)
        self.logger.debug('%s simple_aggregation methods: %s', len(simple_aggregation), simple_aggregation)
        self.logger.debug('%s advanced_aggregation: %s', len(advanced_aggregation), advanced_aggregation)
        
        return simple_aggregation, advanced_aggregation

class AnomalyDetectionEstimator(SromEstimator):
    """Anomaly detection estimator.

    This sub-class estimator mainly overrides method `get_stages()` to construct the anomaly
    pipeline stages (composed of many algorithms).

    This class is a simple wrapper of the underlying SROM library.
    """

    def __init__(self, features, targets, predictions, **kwargs):
        """Initialise the base estimator and remember the threshold output name.

        All arguments are forwarded to the base class; the second entry of `self.predictions`
        is then stored as `self.anomaly_threshold`.
        """
        super().__init__(
            features=features,
            targets=targets,
            predictions=predictions,
            **kwargs
        )
        # the second prediction output name is the anomaly score threshold
        threshold_name = self.predictions[1]
        self.anomaly_threshold = threshold_name

    def create_pipeline(self):
        """Instantiate the SROM `AnomalyPipeline` with its default arguments and return it."""
        # imported inside the method, as in the original implementation
        from srom.pipeline.anomaly_pipeline import AnomalyPipeline as pipeline_class
        anomaly_pipeline = pipeline_class()
        return anomaly_pipeline

    def get_stages(self, df):
        """Return the candidate stages of the semi-supervised anomaly pipeline.

        The result is a list of two stages, each a list of `(name, estimator)` tuples:

        1. scaling options: no-op, standard, robust and min-max scaling
        2. anomaly models: scikit-learn and SROM algorithms, most of them wrapped in
           `GeneralizedAnomalyModel` with the name of the scoring method to call
           (`predict_function`) and a `score_sign`

        `df` is not used by this implementation. All algorithm classes are imported inside the
        method.
        """
        from sklearn.covariance import (OAS, EllipticEnvelope,
                                        EmpiricalCovariance, LedoitWolf,
                                        MinCovDet, ShrunkCovariance)
        from sklearn.ensemble import IsolationForest, RandomForestClassifier
        from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
        from sklearn.neighbors import KernelDensity
        from sklearn.preprocessing import (MinMaxScaler, RobustScaler,
                                           StandardScaler)
        from sklearn.svm import OneClassSVM
        from srom.anomaly_detection.algorithms import (
            LOFNearestNeighborAnomalyModel, NearestNeighborAnomalyModel)
        from srom.anomaly_detection.algorithms.anomaly_ensembler import \
            AnomalyEnsembler
        from srom.anomaly_detection.algorithms.anomaly_robust_pca import \
            AnomalyRobustPCA
        from srom.anomaly_detection.algorithms.bayesian_gmm_outlier import \
            BayesianGMMOutlier
        from srom.anomaly_detection.algorithms.cusum import CUSUM
        from srom.anomaly_detection.algorithms.extended_isolation_forest import \
            ExtendedIsolationForest
        from srom.anomaly_detection.algorithms.extended_spad import \
            ExtendedSPAD
        from srom.anomaly_detection.algorithms.ggm_pgscps import GraphPgscps
        from srom.anomaly_detection.algorithms.ggm_quic import GraphQUIC
        from srom.anomaly_detection.algorithms.gmm_outlier import GMMOutlier
        from srom.anomaly_detection.algorithms.hotteling_t2 import HotellingT2
        from srom.anomaly_detection.algorithms.negative_sample_anomaly import \
            NSA
        # NeuralNetworkNSA is only referenced by the commented-out "neuralnetworknsa" stage below
        from srom.anomaly_detection.algorithms.neural_network_nsa import \
            NeuralNetworkNSA
        from srom.anomaly_detection.algorithms.oob import OOB
        from srom.anomaly_detection.algorithms.pca_q import AnomalyPCA_Q
        from srom.anomaly_detection.algorithms.pca_t2 import AnomalyPCA_T2
        from srom.anomaly_detection.algorithms.random_partition_forest import \
            RandomPartitionForest
        from srom.anomaly_detection.algorithms.sample_svdd import SampleSVDD
        from srom.anomaly_detection.algorithms.spad import SPAD
        from srom.anomaly_detection.gaussian_graphical_anomaly_model import \
            GaussianGraphicalModel
        from srom.anomaly_detection.generalized_anomaly_model import \
            GeneralizedAnomalyModel
        # deep learning model
        from srom.deep_learning.anomaly_detector import DNNAutoEncoder
        from srom.utils.no_op import NoOp

        return [
            # Stage 1: scaling options (including no scaling at all)
            [
                ('skipscaling', NoOp()),
                ('standardscaler', StandardScaler()),
                ('robustscaler', RobustScaler()),
                ('minmaxscaling', MinMaxScaler())
            ],
            # Stage 2: anomaly model options
            [
                # Rule/Density based Anomaly Models
                ('isolationforest', GeneralizedAnomalyModel(base_learner=IsolationForest(), predict_function='decision_function', score_sign=-1)),
                ('gaussianmixture', GeneralizedAnomalyModel(base_learner=GaussianMixture(), predict_function='score_samples', score_sign=1)),
                ('bayesiangaussianmixture', GeneralizedAnomalyModel(base_learner=BayesianGaussianMixture(), predict_function='score_samples', score_sign=1)),
                ('oneclasssvm', GeneralizedAnomalyModel(base_learner=OneClassSVM(), predict_function='decision_function', score_sign=1)),
                ('nearestneighboranomalymodel', GeneralizedAnomalyModel(base_learner=NearestNeighborAnomalyModel(), predict_function='predict', score_sign=1)),
                ('lofnearestneighboranomalymodel', GeneralizedAnomalyModel(base_learner=LOFNearestNeighborAnomalyModel(), predict_function='predict', score_sign=1)),
                ('anomalypca_t2', GeneralizedAnomalyModel(base_learner=AnomalyPCA_T2(), predict_function='anomaly_score',score_sign=1)),
                ('anomalypca_q', GeneralizedAnomalyModel(base_learner=AnomalyPCA_Q(), predict_function='anomaly_score',score_sign=1)),

                # Covariance Structure based Anomaly Models
                ('empiricalcovariance', GeneralizedAnomalyModel(base_learner=EmpiricalCovariance(), fit_function='fit', predict_function='mahalanobis',score_sign=1)),
                ('ellipticenvelope', GeneralizedAnomalyModel(base_learner=EllipticEnvelope(), fit_function='fit', predict_function='mahalanobis',score_sign=1)),
                ('ledoitwolf', GeneralizedAnomalyModel(base_learner=LedoitWolf(), fit_function='fit', predict_function='mahalanobis',score_sign=1)),
                ('mincovdet', GeneralizedAnomalyModel(base_learner=MinCovDet(), fit_function='fit', predict_function='mahalanobis',score_sign=1)),
                ('oas', GeneralizedAnomalyModel(base_learner=OAS(), fit_function='fit', predict_function='mahalanobis',score_sign=1)),
                ('shrunkcovariance', GeneralizedAnomalyModel(base_learner=ShrunkCovariance(), fit_function='fit', predict_function='mahalanobis',score_sign=1)),

                # GaussianGraphicalModel
                ('ggm_default', GaussianGraphicalModel()),
                ('ggm_stochastic', GaussianGraphicalModel(distance_metric='Stochastic_Nearest_Neighbors')),
                ('ggm_kldiverse', GaussianGraphicalModel(distance_metric='KL_Divergence_Dist')),
                ('ggm_kldivergence', GaussianGraphicalModel(distance_metric='KL_Divergence')),
                ('ggm_frobenius', GaussianGraphicalModel(distance_metric='Frobenius_Norm')),
                ('ggm_likelihood', GaussianGraphicalModel(distance_metric='Likelihood')),
                ('ggm_spectral', GaussianGraphicalModel(distance_metric='Spectral')),
                ('ggm_mahalanobis_distance', GaussianGraphicalModel(distance_metric='Mahalanobis_Distance')),
                ('ggm_sparsest_k_subgraph', GaussianGraphicalModel(distance_metric='Sparsest_k_Subgraph')),

                #mixture
                ("bayesiangmmoutlier",GeneralizedAnomalyModel(base_learner=BayesianGMMOutlier(n_components=2, random_state=2),fit_function="fit",predict_function="decision_function",score_sign=-1,)),
                ("gmmoutlier",GeneralizedAnomalyModel(base_learner=GMMOutlier(random_state=2),fit_function="fit",predict_function="decision_function",score_sign=-1)),
                
                #ensemble
                ("nsarf",GeneralizedAnomalyModel(base_learner=NSA(scale=True,sample_ratio=0.95,sample_delta=0.5,base_model=RandomForestClassifier(random_state=2),anomaly_threshold=0.8,),fit_function="fit",predict_function="decision_function",score_sign=-1)),
                ("samplesvdd",GeneralizedAnomalyModel(base_learner=SampleSVDD(outlier_fraction=0.001, kernel_s=5),fit_function="fit",score_sign=1,predict_function="decision_function")),
                ("randompartitionforest",GeneralizedAnomalyModel(base_learner=RandomPartitionForest(anomaly_type="point_wise"),fit_function="fit",predict_function="decision_function",score_sign=-1,)),
                ("extendedisolationforest",GeneralizedAnomalyModel(base_learner=ExtendedIsolationForest(),fit_function="fit",predict_function="decision_function",score_sign=-1)),
                ("anomalyensembler",GeneralizedAnomalyModel(base_learner=AnomalyEnsembler(predict_only=True),fit_function="fit",predict_function="anomaly_score",score_sign=1)),
                ("anomalyrobustpca", GeneralizedAnomalyModel(base_learner=AnomalyRobustPCA(),fit_function="fit",predict_function="anomaly_score",score_sign=1)),
                #Neural network algorithm needs GPUs and takes a very long time to complete
                #("neuralnetworknsa", GeneralizedAnomalyModel(base_learner=NeuralNetworkNSA(scale=True, sample_ratio=25.0, sample_delta=0.05, batch_size=10, epochs=5, dropout=0.85, layer_width=150, n_hidden_layers=2,),fit_function="fit",predict_function="anomaly_score",score_sign=-1)),
                ("graphquic", GeneralizedAnomalyModel(GaussianGraphicalModel(base_learner=GraphQUIC(),sliding_window_size=0, scale=False),fit_function="fit",predict_function="predict",score_sign=1)),
                ("graphpgscps", GeneralizedAnomalyModel(GaussianGraphicalModel(base_learner=GraphPgscps(),sliding_window_size=0, scale=True),fit_function="fit",predict_function="predict",score_sign=1)),
                ("cusum", GeneralizedAnomalyModel(base_learner = CUSUM(), fit_function = 'fit', predict_function = "predict", score_sign = 1)),
                ("hotelling2",GeneralizedAnomalyModel(base_learner = HotellingT2(), fit_function = 'fit', predict_function = "score_samples", score_sign = 1)),
                # NOTE(review): scikit-learn's KernelDensity documents `score_samples`/`score` but no
                # `decision_function` method - confirm GeneralizedAnomalyModel can score with this entry.
                ("kernel_density",GeneralizedAnomalyModel(base_learner = KernelDensity(), fit_function = 'fit', predict_function = "decision_function", score_sign = 1)),
                ("spad", GeneralizedAnomalyModel(base_learner = SPAD(), fit_function = 'fit', predict_function = "decision_function", score_sign = -1)),
                ("spad_extended",GeneralizedAnomalyModel(base_learner = ExtendedSPAD(), fit_function = 'fit', predict_function = "decision_function", score_sign = -1)),
                ("oob",GeneralizedAnomalyModel(base_learner = OOB(), fit_function = 'fit', predict_function = "decision_function", score_sign = 1)),
                ("dnn_autoencoder", GeneralizedAnomalyModel(base_learner=DNNAutoEncoder(), fit_function="fit",predict_function="predict",score_sign=1))
                
            ]
        ]

    def get_param_grid(self):
        """Return the SROM parameter grid of type 'anomaly_detection_fine_grid'."""
        grid_type = 'anomaly_detection_fine_grid'
        return SROMParamGrid(gridtype=grid_type)

    def get_prediction_result_value_index(self):
        return (1, 0)

    def process_prediction_result(self, df, prediction_result, model):
        """Add the threshold column and, if configured, the anomaly-detected column to `df`.

        After the base class processing, the threshold column (`self.anomaly_threshold`) is set to
        the first of the model's best thresholds, or to None when `prediction_result` is None.
        When a third prediction name exists, that column is set to whether the first prediction
        column is greater than the second one. Returns `df`.
        """
        super().process_prediction_result(df, prediction_result, model)

        if prediction_result is None:
            df[self.anomaly_threshold] = None
        else:
            self.logger.debug('best_thresholds=%s', model.get_best_thresholds())
            df[self.anomaly_threshold] = model.get_best_thresholds()[0]

        has_detected_output = len(self.predictions) > 2
        if has_detected_output:
            score_name, threshold_name, detected_name = self.predictions[:3]
            df[detected_name] = df[score_name] > df[threshold_name]

        return df

class UnSupervisedAnomalyDetectionEstimator(SromEstimator):
    """Anomaly detection estimator.

    This sub-class estimator mainly overrides method `get_stages()` to construct the anomaly
    pipeline stages (composed of many algorithms).

    This class is a simple wrapper of the underlying SROM library.
    """

    def __init__(self, features, targets, predictions, **kwargs):
        """Initialize the estimator and keep the keyword arguments for `train_model()`.

        Parameters
        ----------
        features, targets, predictions
            Forwarded unchanged to the base-class constructor. `predictions[1]` is used as
            the name of the anomaly threshold column.
        **kwargs
            Forwarded to the base-class constructor and also stored on `self.kwargs`, from
            which `train_model()` reads its optional settings.
        """
        super().__init__(features=features, targets=targets, predictions=predictions, **kwargs)
        self.anomaly_threshold = self.predictions[1]
        self.kwargs = kwargs
        self.features_for_training = None  # we do not need faildate for training for unsupervised AD

    def create_pipeline(self):
        """Return a new SROM `AnomalyPipeline` created with its default arguments."""
        from srom.pipeline.anomaly_pipeline import AnomalyPipeline
        return AnomalyPipeline()

    def get_stages(self, df):
        """Build one named `GeneralizedAnomalyModel` stage per entry of SROM's `anomaly_dag`.

        Every `anomaly_dag` entry is read positionally: index 0 is the base learner,
        index 1 the score sign, index 2 the predict function name and index 3 the stage
        name. `df` is not used.

        Returns
        -------
        list of (name, GeneralizedAnomalyModel) tuples
        """
        from srom.utils.anomaly_dag import anomaly_dag

        return [
            (
                algorithm[3],
                GeneralizedAnomalyModel(
                    base_learner=algorithm[0],
                    fit_function="fit",
                    predict_function=algorithm[2],
                    score_sign=algorithm[1],
                ),
            )
            for algorithm in anomaly_dag
        ]

    def get_param_grid(self):
        """Return a new `SROMParamGrid` created with gridtype 'anomaly_detection_fine_grid'."""
        return SROMParamGrid(gridtype='anomaly_detection_fine_grid')

    def get_prediction_result_value_index(self):
        """Return the constant index pair ``(1, 0)``."""
        return (1, 0)

    def get_df_for_training(self, df):
        """Log a summary of `df` at debug level, then defer to the base class."""
        self.logger.debug('begin of get_df_for_training df=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return super().get_df_for_training(df)

    def process_prediction_result(self, df, prediction_result, model):
        """Add the anomaly threshold (and optionally the anomaly flag) columns to `df`.

        The base-class implementation runs first. Afterwards column `self.anomaly_threshold`
        is set to the first best threshold reported by `model` (unwrapped once more when that
        entry is itself a non-empty list or array), or to None when there is no prediction
        result. When more than two prediction names are configured, the third one receives
        the boolean comparison "first prediction column > second prediction column".

        Returns the same `df` object.
        """
        super().process_prediction_result(df, prediction_result, model)

        if prediction_result is not None:
            best_threshold_value = model.get_best_thresholds()[0]
            # The isinstance check already excludes None. Comparing an ndarray with
            # `!= None` is elementwise, and the truth value of the result is ambiguous
            # (ValueError) for arrays holding more than one element.
            if isinstance(best_threshold_value, (list, np.ndarray)) and len(best_threshold_value) > 0:
                best_threshold_value = best_threshold_value[0]
            self.logger.debug('best_thresholds=%s', best_threshold_value)
            df[self.anomaly_threshold] = best_threshold_value
        else:
            df[self.anomaly_threshold] = None
        if len(self.predictions) > 2:
            df[self.predictions[2]] = df[self.predictions[0]] > df[self.predictions[1]]

        return df

    # TODO: Handle the following function in the base class, converging all the computation logic in one place. github issue https://github.ibm.com/maximo/Asset-Health-Insight/issues/15287
    def train_model(self, df):
        """Train the unsupervised anomaly pipeline, unless a pre-trained model exists.

        Rows containing NaN are dropped from `df` IN PLACE before training. Settings are
        read from the keyword arguments given to the constructor:

        - threshold_computation_strategy (default 'default'; only logged here)
        - anomaly_threshold_method (default 'std'), together with the matching threshold
          value: std_threshold (2.0), contamination (None), qfunction_threshold (0.1) or
          medianabsolutedev_threshold (2.5)
        - scoring_strategy (default 'em_score'; one of mv_score, em_score, al_score)
        - feature_scaling (default 'skip'; 'standardize' or 'normalize' enable scaling)
        - feature_normalize_range (default (0, 1); may be set to e.g. (-1, 1))
        - execution_type (default 'spark_node_random_search')
        - number_of_option_per_pipeline (default 1)
        - maximum_evaluation_time (default 5, passed as `max_eval_time_minute`)
        - total_execution_time (default 10)
        - random_state (default 42)
        - override_training_stages (default None; replaces the generated stages when set)

        Parameters
        ----------
        df : DataFrame
            Training data; the columns named in `self.features` are used.

        Returns
        -------
        `self.pre_trained_model` when it is not None, otherwise the output of
        `AnomalyPipeline.execute()`.

        Raises
        ------
        ValueError
            If feature_normalize_range is not strictly increasing, or scoring_strategy is
            not one of the supported values.
        """
        self.logger.debug('Pre trained model instance = %s', str(self.pre_trained_model))
        if self.pre_trained_model is not None:
            self.logger.debug('Since pre-trained model is available, training is being skipped here')
            # Caution - this branch returns the pre-trained model while the final statement
            # of this function returns pipeline_output. Need for refactoring.
            return self.pre_trained_model

        self.logger.info('No pretrained model is available, so beginning a new model training using Unsupervised Learning DAG')
        # There could be NaNs in the first row for features such as rate of change or std
        df.dropna(inplace=True)

        threshold_computation_strategy = self.kwargs.get('threshold_computation_strategy', 'default')
        qfunction_threshold = self.kwargs.get('qfunction_threshold', 0.1)
        medianabsolutedev_threshold = self.kwargs.get('medianabsolutedev_threshold', 2.5)
        contamination = self.kwargs.get('contamination', None)
        scoring_strategy = self.kwargs.get('scoring_strategy', 'em_score')
        execution_type = self.kwargs.get('execution_type', 'spark_node_random_search')
        number_of_option_per_pipeline = self.kwargs.get('number_of_option_per_pipeline', 1)
        # maximum_evaluation_time default to 5 minutes
        maximum_evaluation_time = self.kwargs.get('maximum_evaluation_time', 5)

        # set default total_evaluation_time to be 10 minutes
        total_execution_time = self.kwargs.get('total_execution_time', 10)
        self.logger.debug('total_execution_time: %s', total_execution_time)

        random_state = self.kwargs.get('random_state', 42)

        scaling = self.kwargs.get('feature_scaling', 'skip')
        self.logger.debug('scaling: %s', scaling)

        # user can set it to (-1,1)
        feature_normalize_range = self.kwargs.get('feature_normalize_range', (0, 1))
        # Validated regardless of the chosen scaling mode.
        if feature_normalize_range[0] >= feature_normalize_range[1]:
            raise ValueError(' feature_normalize_range[0] must be less than feature_normalize_range[1]. For example (3,2)  is invalid.')

        # scoring_strategy validation
        self.logger.debug('scoring_strategy: %s', scoring_strategy)
        if scoring_strategy not in ('mv_score', 'em_score', 'al_score'):
            raise ValueError('scoring_strategy must be either mv_score, or em_score, or al_score')

        self.logger.debug('Threshold Computation Strategy = %s', threshold_computation_strategy)
        self.logger.debug('qfunction threhsold = %s', qfunction_threshold)
        self.logger.debug('median absolute deviation threshold = %s', medianabsolutedev_threshold)
        self.logger.debug('contamination = %s', contamination)

        # Pipeline creation and execution

        # get anomaly_threshold_method, std_threshold
        anomaly_threshold_method = self.kwargs.get('anomaly_threshold_method', 'std')
        std_threshold = self.kwargs.get('std_threshold', 2.0)
        self.logger.debug('anomaly_threshold_method = %s', anomaly_threshold_method)
        self.logger.debug('std_thresholdd = %s', std_threshold)

        # Set the anomaly threshold. Methods named by the original author:
        #     Std, Contamination, Adaptive-Contamination, QFunction, MedianAbsoluteDev, Otsu
        # Each known method name is paired with the threshold value handed to the pipeline.
        threshold_by_method = {
            'std': std_threshold,
            'contamination': contamination,
            'qfunction': qfunction_threshold,
            'medianabsolutedev': medianabsolutedev_threshold,
            'adaptivecontamination': contamination,
            'otsu': std_threshold,
        }
        # Any other method name is forwarded unchanged, paired with std_threshold.
        pipeline = AnomalyPipeline(
            anomaly_threshold_method,
            threshold_by_method.get(anomaly_threshold_method, std_threshold),
        )

        pipeline.set_scoring(scoring_strategy)

        estimator_list = self.get_stages(df)

        # The first pipeline stage is the (optional) feature scaler.
        if scaling == 'standardize':
            scaler_stage = ('standardscaler', StandardScaler())
        elif scaling == 'normalize':
            scaler_stage = ('minmaxscaling', MinMaxScaler(feature_range=feature_normalize_range))
        else:
            scaler_stage = ('skipscaling', NoOp())
        stages = [[scaler_stage], estimator_list]

        # Caller-supplied stages, when given, replace the generated ones entirely.
        override_stages = self.kwargs.get('override_training_stages', None)
        if override_stages is not None:
            pipeline.set_stages(override_stages)
        else:
            pipeline.set_stages(stages)

        self.logger.info('Final pipeline = %s', pipeline)

        fine_param_grid = SROMParamGrid(gridtype='anomaly_detection_fine_grid')
        self.logger.debug('Training dataset: %s', df[self.features].head(5).values)

        # Unsupervised training: no validation data or labels are supplied.
        pipeline_output = pipeline.execute(trainX=df[self.features].values,
                                           validX=None, validy=None,
                                           verbosity='low',
                                           param_grid=fine_param_grid,
                                           exectype=execution_type,
                                           num_option_per_pipeline=number_of_option_per_pipeline,
                                           max_eval_time_minute=maximum_evaluation_time,
                                           total_execution_time=total_execution_time,
                                           random_state=random_state)

        self.logger.debug('Training best estimator: %s', pipeline.get_best_estimator())

        return pipeline_output

class DataQualityAdvisor(_BaseTransformer):
    """This DataQualityAdvisor checks the data quality.

    The sensor columns of the incoming data frame are scored with dqlearn's
    `AnomalyDetectionDataQuality`; the flow is stopped with an error when the score is
    below the configured minimum.
    """

    def __init__(self, features, minimum_acceptable_data_quality_score=0, **kwargs):
        """
        Parameters
        ----------
        features : `str` or list of `str`
            Column name(s) excluded from the data quality check.
        minimum_acceptable_data_quality_score : number, optional
            Lowest data quality score that lets `execute()` pass (default 0).
        **kwargs
            sensor_time_stamp_column_name : name of the timestamp column
            (default 'evt_timestamp').
            sensor_asset_id_column_name : name of the asset id column (default 'id').
        """

        super().__init__()

        self.features = features
        self.minimum_acceptable_data_quality_score = minimum_acceptable_data_quality_score
        self.sensor_time_stamp_column_name = kwargs.get('sensor_time_stamp_column_name', 'evt_timestamp')
        self.sensor_asset_id_column_name = kwargs.get('sensor_asset_id_column_name', 'id')

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Run the data quality check on `df` and return `df` unchanged.

        All columns of `df` (after `reset_index`) except the asset id column, the timestamp
        column and the excluded `features` are checked. `start_ts`, `end_ts` and `entities`
        are not used.

        Raises
        ------
        ValueError
            If the asset id column, the timestamp column or an excluded feature is not a
            column of `df` after resetting its index.
        RuntimeError
            If the computed data quality score is below
            `minimum_acceptable_data_quality_score`.
        """
        self.logger.debug('Beginning DataQualityAdvisor df_input=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        self.logger.debug('DataQualityAdvisor minimum acceptable data quality score = %s', self.minimum_acceptable_data_quality_score)

        # reset_index returns a new frame, so the caller's df is left untouched
        x_df = df.reset_index(inplace=False, drop=False)

        # Accept either one column name or a collection of column names.
        if isinstance(self.features, (list, tuple, set)):
            excluded_features = list(self.features)
        else:
            excluded_features = [self.features]

        training_features = list(x_df.columns)
        # list.remove raises ValueError when a column is missing
        for column in [self.sensor_asset_id_column_name, self.sensor_time_stamp_column_name] + excluded_features:
            training_features.remove(column)

        self.logger.debug('Features excluded from the DataQualityAdvisor check = %s', self.features)
        self.logger.debug('Training features considered for DataQualityAdvisor=%s', training_features)

        from dqlearn.pipeline.anomaly import AnomalyDetectionDataQuality

        AnomalyDQ = AnomalyDetectionDataQuality(params={
            'data_col': training_features,
            'timestamp_col': self.sensor_time_stamp_column_name
        })

        # Just overwrite the data frame to avoid proliferation of copies taking up memory
        x_df = x_df[training_features + [self.sensor_time_stamp_column_name]]
        self.logger.debug('Data types of the features before starting the DataQualityAdvisor for anomaly detection = %s', {col: str(dtype) for col, dtype in dict(x_df.dtypes).items()})

        self.logger.info('Running sensor data quality. This prints out entire DataFrames to stdout and ignores the log level.')
        data_quality_pipeline = AnomalyDQ.sensor_data_quality(x_df)

        self.logger.info('Printing DataQualityAdvisor result report. This report will print regardless of log level and can\'t be changed. Report:')
        data_quality_pipeline.result_report()

        data_quality_report = data_quality_pipeline.report.report

        # A report without a 'score' entry is treated as score 0
        data_quality_score = data_quality_report.get('score', 0)

        if data_quality_score < self.minimum_acceptable_data_quality_score:
            raise RuntimeError(f'Training pipeline flow stopped because the data quality score for anomaly detection {data_quality_score} is less than the minimum required {self.minimum_acceptable_data_quality_score}. Either try to lower the expected score or clean up the data.')

        self.logger.info('Data Quality checks for Anomaly Detection training features passed the threshold minium score required = %s, score computed by data quality advisor = %3.2f', self.minimum_acceptable_data_quality_score, data_quality_score)

        return df




