# Licensed Materials - Property of IBM
# 5737-M66, 5900-AAA
# (C) Copyright IBM Corp. 2019, 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication, or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.

"""This module includes reusable transformers as data preprocessors in pipelines."""

from asyncio.log import logger
from cmath import log
from distutils.log import debug
from os import sep
import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import Day, Hour, Minute, Second
import logging

#from srom.feature_engineering.timeseries.rolling_window_feature_extraction import simple_summary_statistics
from srom.feature_engineering.timeseries.rolling_window_feature_extraction import simple_summary_statistics, \
                                advance_summary_statistics, rate_of_change
#from dqlearn.missing.missing_pattern import check_missing_properties
from srom.auto.auto_imputation import AutoImputation

import iotfunctions.base

from .util import get_logger, log_df_info
import json


class _BaseTransformer(iotfunctions.base.BaseTransformer):
    """Common parent of the transformers defined in this module.

    It mainly serves as the single place where the shared `execute()` contract is documented.
    Every transformer should derive from this class and provide its own class docstring.
    """

    def __init__(self):
        super().__init__()
        # reset again at the start of every execute() call of this base class
        self.df_traces = {}
        self.logger = get_logger(self)

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Run the transformation on the given dataframe.

        This base implementation only clears `df_traces` and hands the input back untouched.

        Parameters
        ----------
        df : `DataFrame`
            The input dataframe.
        start_ts : datetime-like, `str`, `int`, `float`, optional
            Start of the time range to process, parsable by `pd.Timestamp`. Default is None.
        end_ts : datetime-like, `str`, `int`, `float`, optional
            End of the time range to process, parsable by `pd.Timestamp`. Default is None.
        entities : `list` of `str`, optional
            The entities to process; entities not listed are ignored. Default is None, meaning no filtering.

        Returns
        -------
        `DataFrame`
            The same `df` object that was passed in.
        """
        self.df_traces = {}
        return df


class SimpleSummaryStatistics(_BaseTransformer):
    """This transformer generates simple summary statistics.

    The features for which summary statistics are generated are given at construction time, together
    with a list of summary aggregation methods and the rolling window size. The new statistics
    features are generated based on those parameters and appended to the original dataframe. The names
    of those generated statistics features have a fixed format: `<base_column>_<agg_method>_<window_size>`.

    The generated statistics are appended as new features.

    Note that beginning rows in the data set which do not have a complete rolling window are dropped
    before returning the dataframe, because their rolling summary values are not exactly correct.

    One implication of this dropping is that when you train with a specific time range of history
    (that is, you give `start_ts` and/or `end_ts` when calling `execute()`), you might want to move
    your `start_ts` a little earlier in time so you get the desired range of history. For example,
    with a '5d' rolling window, you can move your `start_ts` 5 days earlier to make sure the starting
    points are not dropped.
    """

    def __init__(self, features, aggregation_methods, rolling_window_size):
        """
        Parameters
        ----------
        features : `list` of `str`
            The features to generate summary statistics.
        aggregation_methods : `list` of `str`
            The list of aggregation methods to use to generate summary statistics. The valid methods are:
            {'mean', 'max', 'min', 'median', 'std', 'sum', 'count'}.
        rolling_window_size : `str`
            The size of the rolling window to generate summary statistics, specified in
            [Pandas offset alias](http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases)
            and can be only multiple of second ('S'), minute ('T'), hour ('H'), or day ('D').

        Raises
        ------
        ValueError
            If any parameter has the wrong type, is empty, or `rolling_window_size` is not a valid
            offset of one of the supported units.
        """

        super().__init__()

        if not isinstance(features, list) or not features:
            raise ValueError('parameter features must be an non-empty list of strings')
        self.features = features

        if not isinstance(aggregation_methods, list) or not aggregation_methods:
            raise ValueError('parameter aggregation_methods must be an non-empty list of strings')
        self.aggregation_methods = aggregation_methods

        if not isinstance(rolling_window_size, str):
            raise ValueError('parameter rolling_window_size must be a string')
        try:
            base = to_offset(rolling_window_size)
            if type(base) not in [Second, Minute, Hour, Day]:
                raise ValueError("parameter rolling_window_size must be a multiple of second ('S'), minute ('T'), hour ('H'), or day ('D')")
        except ValueError as e:
            # both the unit check above and to_offset() parse failures end up here; keep the cause chained
            raise ValueError('invalid rolling_window_size=%s: %s' % (rolling_window_size, e)) from e
        self.rolling_window_size = rolling_window_size

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Append rolling summary statistics to `df` and drop rows without a complete window.

        Parameters
        ----------
        df : `DataFrame`
            The input dataframe; its index levels are restored on the returned dataframe.
        start_ts : datetime-like, `str`, `int`, `float`, optional
            Start of the processed range, converted with `pd.Timestamp`. When None, the timestamp of
            the first row is used. Rows earlier than `start_ts` plus the rolling window are dropped.
        end_ts : datetime-like, `str`, `int`, `float`, optional
            Only logged by this method.
        entities : `list` of `str`, optional
            Only logged by this method.

        Returns
        -------
        `DataFrame`
            The original columns plus the statistics columns, without rows that have NA in any of
            the features or the added statistics.
        """
        self.logger.debug('Executing Simple Summary Statistics with args: start_ts=%s, end_ts=%s, entities=%s, df_input=%s', start_ts, end_ts, entities, log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        df_original = df

        # pick only needed columns for scoring; walk self.features in order (instead of going through
        # a set) so the selected column order is the same on every run
        index_names = df.index.names
        df = df[[name for name in dict.fromkeys(self.features) if name not in index_names]]

        sources_not_in_column = df.index.names
        df = df.reset_index()

        df_index_id_name, df_index_timestamp_name = self._entity_type._df_index_entity_id, self._entity_type._timestamp

        df = simple_summary_statistics(df=df,
                                       aggregation_methods=self.aggregation_methods,
                                       rolling_window_size=self.rolling_window_size,
                                       variable_clms=self.features,
                                       asset_id_clm=df_index_id_name,
                                       date_clm=df_index_timestamp_name,
                                       date_clm_format='%Y-%m-%d', # need to be provided, fixed, but not actually used
                                       min_periods=None,
                                       aggregation_type='time')

        # map the fixed column names used in the result back to the entity type's names
        simple_summary_statistics_id_name, simple_summary_statistics_timestamp_name = 'asset_id', 'datetime'
        df = df.rename(columns={
            simple_summary_statistics_id_name: df_index_id_name,
            simple_summary_statistics_timestamp_name: df_index_timestamp_name
        })

        df[df_index_timestamp_name] = df[df_index_timestamp_name].astype('datetime64[ms]')

        # drop original features, non-statistics columns
        to_drop = set(df.columns) & set(self.features + [df_index_id_name, df_index_timestamp_name])
        df = df.drop(columns=to_drop)

        # need the stats column names added for later dropna usage
        added_stats_columns = list(df.columns)

        # concat horizontally the new statistics columns to the original df
        df = pd.concat([df_original.reset_index(), df], axis=1)
        self.logger.debug('Original DataFrame concatenated with statistics: %s', log_df_info(df, head=0, logger=self.logger, log_level=logging.DEBUG))

        # find time window day range, filter the df to remain only right result
        if start_ts is None:
            start_ts = pd.Timestamp(df[df_index_timestamp_name].iloc[0])
        else:
            # start_ts may be any value parsable by pd.Timestamp (str, int, float, datetime-like),
            # not only str; normalize so that adding the offset below always works
            start_ts = pd.Timestamp(start_ts)
        offset = to_offset(self.rolling_window_size)
        cut_off = start_ts + offset
        self.logger.debug('Filtering time window by start time and rolling window size: (start_ts=%s) + (offset=%s) = (cut-off=%s)', start_ts, offset, cut_off)
        self.logger.debug('DataFrame before filtering by rolling window size: %s', log_df_info(df, head=0, logger=self.logger, log_level=logging.DEBUG))
        df = df[df[df_index_timestamp_name] >= cut_off]
        self.logger.debug('Filtered DataFrame by rolling window size: %s', log_df_info(df, head=0, logger=self.logger, log_level=logging.DEBUG))

        # TODO how do we handle empty dataframe after filtering out not enough rolling time window data?

        df = df.set_index(keys=sources_not_in_column)

        df = df.dropna(subset=self.features + added_stats_columns)

        self.logger.debug('Final Dataframe: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return df


class TransformNotNaToEvent(_BaseTransformer):
    """Turn a feature into an event marker depending on whether its value is NA.

    Every non-NA value is replaced by the configured event label; NA values stay NA.
    """

    def __init__(self, feature, event_label=1):
        """
        Parameters
        ----------
        feature : `str`
            The feature to be transformed to event.
        event_label : optional
            The value written wherever the feature is not NA. Default is 1.

        Raises
        ------
        ValueError
            If `feature` is not a non-empty string.
        """

        super().__init__()

        if not (isinstance(feature, str) and feature):
            raise ValueError('parameter feature must be an non-empty string')

        self.feature = feature
        self.event_label = event_label

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Overwrite the feature column of `df` with the event label / NA and return `df`.

        The column is assigned on the passed-in dataframe itself; `start_ts`, `end_ts` and
        `entities` are not used.
        """
        rows_before = df[pd.notna(df[self.feature])]
        self.logger.debug('Input DataFrame=%s', log_df_info(rows_before, head=5, logger=self.logger, log_level=logging.DEBUG))

        has_value = pd.notna(df[self.feature])
        df[self.feature] = np.where(has_value, self.event_label, np.nan)

        rows_after = df[pd.notna(df[self.feature])]
        self.logger.debug('Output DataFrame=%s', log_df_info(rows_after, head=5, logger=self.logger, log_level=logging.DEBUG))

        return df


class IdentifyPreFailureWindow(_BaseTransformer):
    """This transformer expands a failure event feature based on the given pre-failure window information.

    A pre-failure window is a time-series window immediately before a failure event happens. Within the window,
    data points are further grouped into two sets, one failure set and the other normal set. The failure set is
    the one closer to the failure event, in time order.

    You specify the size of this pre-failure window as well as the size of the failure set within the window
    when creating this transformer.

    The given failure event is transformed into a 3-state event: 1 for failures within any pre-failure window,
    0 for non-failures within any pre-failure window, and NA for all those outside of any pre-failure window.
    """

    def __init__(self, feature, pre_failure_window_size, pre_failure_failure_size):
        """
        Parameters
        ----------
        feature : `str`
            The failure event feature to be transformed.
        pre_failure_window_size : int
            The size of the pre-failure window, number of time-series data points.
        pre_failure_failure_size : int
            The number of data points, immediately before the failure event, to be labeled also as failures
            within the pre-failure window.

        Raises
        ------
        ValueError
            If `feature` is not a non-empty string, if either size is not a positive integer, or if
            `pre_failure_window_size` is smaller than `pre_failure_failure_size`.
        """

        super().__init__()

        if not isinstance(feature, str) or not feature:
            raise ValueError('parameter feature must be an non-empty string')
        self.feature = feature

        if not isinstance(pre_failure_window_size, int) or pre_failure_window_size <= 0:
            raise ValueError('parameter pre_failure_window_size must be a positive integer')
        self.pre_failure_window_size = pre_failure_window_size

        if not isinstance(pre_failure_failure_size, int) or pre_failure_failure_size <= 0:
            raise ValueError('parameter pre_failure_failure_size must be a positive integer')
        self.pre_failure_failure_size = pre_failure_failure_size

        if self.pre_failure_window_size < self.pre_failure_failure_size:
            raise ValueError('parameter pre_failure_window_size must not be smaller than pre_failure_failure_size')

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Label the rows preceding each failure event and return the labeled dataframe.

        Rows are scanned from last to first. After a row whose feature is not NA, the following
        NA rows (in scan order) are counted: the first `pre_failure_failure_size` of them get 1,
        the ones up to `pre_failure_window_size` get 0, and counting stops until the next non-NA
        row is met. Labels are written by index label on a reversed copy of `df`, which is then
        reversed back to the input row order. `start_ts`, `end_ts` and `entities` are not used.
        """
        self.logger.debug('Input DataFrame=%s', log_df_info(df[pd.notna(df[self.feature])], head=5))

        self.logger.debug('pre_failure_window_size=%s, pre_failure_failure_size=%s', self.pre_failure_window_size, self.pre_failure_failure_size)

        # first reverse the rows; take a copy so the label assignments below write to a frame we own
        # rather than to a slice of the caller's dataframe
        df = df[::-1].copy()

        pre_failure_normal = []
        pre_failure_failure = []
        current_count = -1  # -1 means "not inside any pre-failure window"
        # iterate the feature column directly: itertuples() renames fields whose column names are not
        # valid identifiers, which would break a lookup by the original feature name
        for index_label, value in df[self.feature].items():
            if pd.notna(value):
                # a failure event (re)starts the window
                current_count = 0
            elif current_count >= 0:
                current_count += 1
                if current_count <= self.pre_failure_failure_size:
                    pre_failure_failure.append(index_label)
                elif current_count <= self.pre_failure_window_size:
                    pre_failure_normal.append(index_label)
                else:
                    # walked past the window
                    current_count = -1

        if pre_failure_failure:
            df.loc[pre_failure_failure, [self.feature]] = 1
        if pre_failure_normal:
            df.loc[pre_failure_normal, [self.feature]] = 0

        # reverse back the rows
        df = df[::-1]

        self.logger.debug('Output DataFrame=%s', log_df_info(df[pd.notna(df[self.feature])], head=5))

        return df


class TemporalRateChangeFeaturesTransformer(_BaseTransformer):
    """This transformer generates simple summary statistics and rate of change.

    The features for which the summary statistics are generated are given at construction time,
    together with a list of aggregation methods and the rolling window size. The names of the
    generated statistics features have a fixed format: `<base_column>_<agg_method>_<window_size>`.

    `execute()` splits the requested aggregation methods into "simple" and "advanced" ones and
    dispatches to `compute_temporal_features()`, `compute_rate_change()` or
    `compute_temporal_rate_change_features()` accordingly.
    """

    # aggregation methods handled by simple_summary_statistics
    _SIMPLE_AGGREGATION_METHODS = ("mean", "max", "min", "median", "std", "sum", "count")

    # aggregation methods handled by advance_summary_statistics (names are passed through verbatim)
    _ADVANCED_AGGREGATION_METHODS = (
        "rate_of_change",
        "sum_of_change",
        "absoluate_sum_of_changes",
        "trend_slop",
        "abs_energy",
        "mean_abs_change",
        "mean_change",
        "mean_second_derivate_central",
        "count_above_mean",
        "count_below_mean",
        "last_location_of_maximum",
        "first_location_of_maximum",
        "corr_coefficient",
    )

    def __init__(self, features, aggregation_methods, rolling_window_size):
        """
        Parameters
        ----------
        features : `list` of `str`
            The features to generate summary statistics.
        aggregation_methods : `list` of `str`
            The list of aggregation methods to use. Simple methods are
            {'mean', 'max', 'min', 'median', 'std', 'sum', 'count'}; the advanced methods are the
            names listed in `_ADVANCED_AGGREGATION_METHODS`.
        rolling_window_size : `str`
            The size of the rolling window to generate summary statistics, specified in
            [Pandas offset alias](http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases)
            and can be only multiple of second ('S'), minute ('T'), hour ('H'), or day ('D').

        Raises
        ------
        ValueError
            If any parameter has the wrong type, is empty, or `rolling_window_size` is not a valid
            offset of one of the supported units.
        """

        super().__init__()

        if not isinstance(features, list) or not features:
            raise ValueError('parameter features must be an non-empty list of strings')
        self.features = features

        if not isinstance(aggregation_methods, list) or not aggregation_methods:
            raise ValueError('parameter aggregation_methods must be an non-empty list of strings')
        self.aggregation_methods = aggregation_methods

        if not isinstance(rolling_window_size, str):
            raise ValueError('parameter rolling_window_size must be a string')
        try:
            base = to_offset(rolling_window_size)
            if type(base) not in [Second, Minute, Hour, Day]:
                raise ValueError("parameter rolling_window_size must be a multiple of second ('S'), minute ('T'), hour ('H'), or day ('D')")
        except ValueError as e:
            # both the unit check above and to_offset() parse failures end up here; keep the cause chained
            raise ValueError('invalid rolling_window_size=%s: %s' % (rolling_window_size, e)) from e
        self.rolling_window_size = rolling_window_size

    def compute_temporal_features(self, df, simple_aggregation_methods, start_ts=None, end_ts=None):
        """Compute the simple rolling statistics and return them indexed by (entity id, timestamp).

        `start_ts` and `end_ts` are accepted for signature symmetry but are not used.
        """
        self.logger.debug('begin of compute_temporal_features  df_input=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('simple_aggregation_methods=%s', simple_aggregation_methods)

        self.logger.debug('before change Features and metadata columns before processing the input data frame %s %s', self.features, df.index.names)
        df = df.reset_index(drop=False)
        self.logger.debug('after change Features and metadata columns before processing the input data frame %s %s', self.features, df.index.names)

        df_index_id_name, df_index_timestamp_name = self._entity_type._df_index_entity_id, self._entity_type._timestamp

        feature_df = simple_summary_statistics(
            df=df,
            aggregation_methods=simple_aggregation_methods,
            rolling_window_size=self.rolling_window_size,
            variable_clms=self.features,
            asset_id_clm=df_index_id_name,
            date_clm=df_index_timestamp_name,
            date_clm_format='%Y-%m-%d %H:%M:%S', # need to be provided, fixed, but not actually used
            min_periods=None,
            aggregation_type='time'
        )

        # report the shape of the generated frame, not of the input
        self.logger.debug('Shape of the data frame after feature creation = %s', feature_df.shape)
        feature_df.rename({'asset_id': df_index_id_name, 'datetime': df_index_timestamp_name}, inplace=True, axis=1)
        feature_df[df_index_timestamp_name] = pd.to_datetime(feature_df[df_index_timestamp_name], format='%Y-%m-%d %H:%M:%S')

        feature_df = feature_df.set_index(keys=[df_index_id_name, df_index_timestamp_name])

        self.logger.debug('end of compute_temporal_features  feature_df=%s', log_df_info(feature_df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return feature_df

    def compute_rate_change(self, df, advanced_aggregation_methods, start_ts=None, end_ts=None):
        """Compute the advanced (rate-of-change style) statistics, indexed by (entity id, timestamp).

        `start_ts` and `end_ts` are accepted for signature symmetry but are not used.
        """
        self.logger.debug('begin of compute_rate_change df_input=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('advanced_aggregation_methods=%s', advanced_aggregation_methods)

        df = df.reset_index(drop=False)

        df_index_id_name, df_index_timestamp_name = self._entity_type._df_index_entity_id, self._entity_type._timestamp

        rate_change_df = advance_summary_statistics(df, self.rolling_window_size, self.features,
                                                    df_index_id_name, df_index_timestamp_name, '%Y-%m-%d %H:%M:%S', None,
                                                    aggregation_type='time', aggregation_methods=advanced_aggregation_methods)
        self.logger.debug('Shape of data frame after rate change = %s', rate_change_df.shape)
        rate_change_df.rename({'asset_id': df_index_id_name, 'datetime': df_index_timestamp_name}, inplace=True, axis=1)
        rate_change_df[df_index_timestamp_name] = pd.to_datetime(rate_change_df[df_index_timestamp_name], format='%Y-%m-%d %H:%M:%S')
        rate_change_df = rate_change_df.set_index(keys=[df_index_id_name, df_index_timestamp_name])

        self.logger.debug('end of compute_rate_change  rate_change_df=%s', log_df_info(rate_change_df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return rate_change_df

    def compute_temporal_rate_change_features(self, df, simple_aggregation_methods, advanced_aggregation_methods, start_ts=None, end_ts=None):
        """Compute simple statistics, then advanced statistics on that result, and combine both.

        The advanced statistics are computed from the simple-statistics frame; every advanced column
        that is not an id, timestamp or original feature column is copied onto it. The result gets the
        input's index levels back. `start_ts` and `end_ts` are not used.
        """
        self.logger.debug('begin of compute_temporal_rate_change_features execute() df_input=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('simple_aggregation_methods=%s', simple_aggregation_methods)
        self.logger.debug('advanced_aggregation_methods=%s', advanced_aggregation_methods)

        # pick only needed columns for scoring; walk self.features in order (instead of going through
        # a set) so the selected column order is the same on every run
        index_names = df.index.names
        df = df[[name for name in dict.fromkeys(self.features) if name not in index_names]]

        sources_not_in_column = df.index.names

        df = df.reset_index()

        df_index_id_name, df_index_timestamp_name = self._entity_type._df_index_entity_id, self._entity_type._timestamp

        feature_df = simple_summary_statistics(df=df,
                                               aggregation_methods=simple_aggregation_methods,
                                               rolling_window_size=self.rolling_window_size,
                                               variable_clms=self.features,
                                               asset_id_clm=df_index_id_name,
                                               date_clm=df_index_timestamp_name,
                                               date_clm_format='%Y-%m-%d %H:%M:%S', # need to be provided, fixed, but not actually used
                                               min_periods=None,
                                               aggregation_type='time')

        self.logger.debug('Shape of the data frame after feature creation = %s', feature_df.shape)
        feature_df.rename({'asset_id': df_index_id_name, 'datetime': df_index_timestamp_name}, inplace=True, axis=1)
        feature_df[df_index_timestamp_name] = pd.to_datetime(feature_df[df_index_timestamp_name], format='%Y-%m-%d %H:%M:%S')
        self.logger.debug('feature_df columns=%s', feature_df.columns.values)

        rate_change_df = advance_summary_statistics(feature_df, self.rolling_window_size, self.features,
                                                    df_index_id_name, df_index_timestamp_name, '%Y-%m-%d %H:%M:%S', None,
                                                    aggregation_type='time', aggregation_methods=advanced_aggregation_methods)
        self.logger.debug('Shape of data frame after rate change = %s', rate_change_df.shape)
        rate_change_df.rename({'asset_id': df_index_id_name, 'datetime': df_index_timestamp_name}, inplace=True, axis=1)
        self.logger.debug('rate_change_df columns=%s', rate_change_df.columns.values)

        # copy over only the newly computed columns, in the order they appear in rate_change_df
        not_rate_change = {df_index_timestamp_name, df_index_id_name, *self.features}
        rate_change_features = [col for col in dict.fromkeys(rate_change_df.columns) if col not in not_rate_change]
        for rcf in rate_change_features:
            feature_df[rcf] = rate_change_df[rcf]

        self.logger.debug('Shape of data frame after combining the results = %s', feature_df.shape)
        self.logger.debug('combined feature_df columns=%s', feature_df.columns.values)

        feature_df = feature_df.set_index(keys=sources_not_in_column)

        self.logger.debug('final feature_df=%s', log_df_info(feature_df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return feature_df

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Classify the configured aggregation methods and run the matching computation.

        Returns
        -------
        `DataFrame`
            The result of `compute_temporal_features()` when only simple methods are configured,
            of `compute_rate_change()` when only advanced methods are configured, of
            `compute_temporal_rate_change_features()` when both kinds are present, or `df` itself
            when no method is configured.

        Raises
        ------
        RuntimeError
            If an aggregation method is neither a known simple nor a known advanced method.
        """
        self.logger.debug('begin of TemporalRateChangeFeaturesTransform execute() start_ts=%s, end_ts=%s, entities=%s, df_input=%s', start_ts, end_ts, entities, log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('aggregation_methods=%s', self.aggregation_methods)

        simple_aggregation = []
        advanced_aggregation = []
        for method in self.aggregation_methods:
            if method in self._SIMPLE_AGGREGATION_METHODS:
                simple_aggregation.append(method)
            elif method in self._ADVANCED_AGGREGATION_METHODS:
                advanced_aggregation.append(method)
            else:
                raise RuntimeError('function %r in the aggregation_methods is not defined' % (method,))

        self.logger.debug('simple_aggregation=%s (%d)', simple_aggregation, len(simple_aggregation))
        self.logger.debug('advanced_aggregation=%s (%d)', advanced_aggregation, len(advanced_aggregation))

        if simple_aggregation and advanced_aggregation:
            return self.compute_temporal_rate_change_features(df, simple_aggregation, advanced_aggregation, start_ts, end_ts)
        if simple_aggregation:
            return self.compute_temporal_features(df, simple_aggregation, start_ts, end_ts)
        if advanced_aggregation:
            return self.compute_rate_change(df, advanced_aggregation, start_ts, end_ts)
        return df
        


class MissingValueAnalysisTransformer(_BaseTransformer):
    """
    ***DEPRECATED: Has been replaced by MissingValueImputationTransformer***

    Analyzes the dataset using DQLearn's `check_missing_properties`. Raises an error if the missing
    rate of any checked column exceeds the configured threshold.
    """

    def __init__(self, table_name='sensor table', included_cols=[], excluded_cols=['faildate'], missing_threshold=0.3):
        """
        Parameters
        ----------
        table_name : `str`
            Name of table that is being checked; only used in log and error messages.
        included_cols : `list`
            List of columns to check. If empty, all columns excluding those in `excluded_cols` are checked.
        excluded_cols : `list`
            List of columns to leave out of the check. Only applies if `included_cols` is empty.
        missing_threshold : `float`
            Maximum rate of data allowed to be missing for any attribute.
        """
        super().__init__()

        self.table_name = table_name
        self.missing_threshold = missing_threshold
        # NOTE: the list defaults are shared between instances; this class only reads them
        self.included_cols = included_cols
        self.excluded_cols = excluded_cols
        # the report of the most recent execute() call, None until then
        self.missing_val_report = None

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Run the missing value analysis on `df` and return `df` unchanged when it passes.

        The full report is kept in `self.missing_val_report`. `start_ts`, `end_ts` and `entities`
        are not used.

        Raises
        ------
        RuntimeError
            If the missing rate of any checked column exceeds `self.missing_threshold`.
        """
        from dqlearn.missing.missing_pattern import check_missing_properties

        self.logger.debug(f'begin of MissingValueAnalysisTransformer execute self.missing_threshold= {self.missing_threshold} ')

        if len(self.included_cols) > 0:
            cols_to_check = self.included_cols
        else:
            cols_to_check = [col for col in df.columns if col not in self.excluded_cols]
            self.logger.debug(f'Excluding checks on columns {self.excluded_cols}')

        self.logger.debug(f'Running missing value analysis on {len(df)} items from columns {cols_to_check}')
        missing_val_report = check_missing_properties(
                                        df=df,
                                        data_cols=cols_to_check,
                                        default_missing_threshold=self.missing_threshold,
                                        verify_df=True)
        self.missing_val_report = missing_val_report

        for col in cols_to_check:
            col_properties = missing_val_report['results']['details']['feature_missing_properties'][col]
            self.logger.debug(f"Missing values for column {col}: " +
                                f"{col_properties['num_missing']}/" +
                                f"{col_properties['sample_size']}")

        # the report is a dict (it is indexed above), so it must be passed as-is, not called
        if self.missing_rate_above_threshold(missing_val_report, self.missing_threshold):
            raise RuntimeError(f'Training failed because the {self.table_name} data contains missing values. Ensure training data is complete.')
        self.logger.debug(f'Finished checking for missing values on {self.table_name}. Missing values count did not exceed threshold.')

        return df

    def missing_rate_above_threshold(self, missing_val_dict, missing_threshold):
        """Helper function telling whether any attribute's missing rate exceeds the threshold.

        Parameters
        ----------
        missing_val_dict: `dict`
            missing value dictionary return from calling dqlearn.missing.missing_pattern.check_missing_properties
        missing_threshold : `float`
            The missing rate above which an attribute counts as exceeding.

        Returns
        -------
        bool
            True as soon as one attribute's `missing_rate` is greater than `missing_threshold`,
            otherwise False.
        """
        feature_properties = missing_val_dict['results']['details']['feature_missing_properties']
        for key, value in feature_properties.items():
            # log through this transformer's own logger, like the rest of the class
            self.logger.debug(f'Checking missing values on attribute {key}')
            if value['missing_rate'] > missing_threshold:
                self.logger.debug(f"Attribute {key} missing value rate ({value['missing_rate']}) exceeded threshold {missing_threshold}!")
                return True
            self.logger.debug(f"Attribute {key} missing value rate ({value['missing_rate']}) did not exceed threshold {missing_threshold}!")
        return False



class AutoImputationTransformer(_BaseTransformer):
    """
    ***DEPRECATED: Has been replaced by MissingValueImputationTransformer***

    Imputes missing values in the dataset, one asset at a time, using
    `srom.auto.auto_imputation.AutoImputation`.
    """

    def __init__(self, config, features=None, **kwargs):
        """
        Parameters
        ----------
        config : `dict`
            Dictionary specifying config to be used for auto impute. Configurable options include: `[level, save_prefix,
            execution_platform, cv_type, cv, scoring, execution_time_per_pipeline, total_execution_time, missing_vals,
            random_state, imputation_type]`. See `help(srom.auto.auto_imputation.AutoImputation)` for details on what these
            parameters do.
        features : list-like of `str`, optional
            Names of the columns to be imputed. When a dict is given, its keys are used as the column names.
            Defaults to an empty collection.
        **kwargs : `dict`
            `timestamp_column_name` (default `'evt_timestamp'`) and `asset_id_column_name` (default `'id'`) are read
            from here; other entries are ignored.
        """
        super().__init__()
        self.timestamp_column_name = kwargs.get('timestamp_column_name', 'evt_timestamp')
        self.asset_id_column_name = kwargs.get('asset_id_column_name', 'id')

        self.imputation = AutoImputation(level=config.get('level', 'default'),
                                         save_prefix=config.get('save_prefix', 'auto_imputation_'),
                                         execution_platform=config.get('execution_platform', 'spark_node_random_search'),
                                         cv_type=config.get('cv_type', 'ImputationKFold'),
                                         cv=config.get('cv', 5),
                                         scoring=config.get('scoring', 'neg_mean_absolute_error'),
                                         execution_time_per_pipeline=config.get('execution_time_per_pipeline', 2),
                                         total_execution_time=config.get('total_execution_time', 10),
                                         missing_vals=config.get('missing_vals', 0.1),
                                         random_state=config.get('random_state', 42),
                                         imputation_type=config.get('imputation_type', 'timeseries'))

        # A fresh container per instance instead of a mutable default argument shared by all instances.
        self.features = {} if features is None else features
        self.logger.debug('The list of features: '+str(self.features))

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Impute the feature columns of every asset that has missing values.

        Parameters
        ----------
        df : `DataFrame`
            The input dataframe. Its index is flattened with `reset_index`, after which the asset id and
            timestamp columns configured on this transformer must be present as columns.
        start_ts, end_ts, entities : optional
            Only used for logging.

        Returns
        -------
        `DataFrame`
            The per-asset frames concatenated and indexed by (asset id, timestamp). Assets without any
            missing value are passed through unchanged; for the others the feature columns are replaced
            by the imputed values. An empty `DataFrame` is returned when there is no asset to process.
        """
        self.logger.debug(' Running auto imputation with configuration: %s', self.imputation)
        self.logger.debug('begin of AutoImputationTransformer execute() start_ts=%s, end_ts=%s, entities=%s, df_input=%s', start_ts, end_ts, entities, log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # Normalise to a list so that both list-like and dict (keys) inputs select columns the same way.
        feature_columns = list(self.features)
        feature_names = set(feature_columns)

        # non_imputable_columns like installdate and faildate (columns of the input frame, index levels excluded)
        non_imputable_columns = [col for col in df.columns if col not in feature_names]
        self.logger.debug(' non_imputable_columns =  %s', non_imputable_columns)

        index_columns = [self.asset_id_column_name, self.timestamp_column_name]
        df_for_imputation = df.reset_index(inplace=False, drop=False)
        per_asset_frames = []
        for asset_id, group_df in df_for_imputation.groupby(self.asset_id_column_name):
            self.logger.debug(f'Performing auto imputation for asset {asset_id}')

            missing_values_by_columns = group_df.isna().sum()
            self.logger.debug('Missing values before imputation: %s', missing_values_by_columns)

            if missing_values_by_columns.sum() > 0:
                sensor_X = group_df[feature_columns].values

                # begin auto imputation
                self.imputation.automate(sensor_X, sensor_X)
                self.imputation.fit(sensor_X, sensor_X)
                sensor_data_imputed_ts = self.imputation.transform(sensor_X)
                # Logged instead of calling IPython's display(), which is undefined outside a notebook.
                self.logger.debug("Imputed values 'sensor_data_imputed_ts': %s", sensor_data_imputed_ts)

                # put results back into df
                df_sensor_imputed_ts = pd.DataFrame(sensor_data_imputed_ts, columns=feature_columns)
                # Copy the untouched columns positionally: the new frame has a fresh 0..n-1 index while
                # group_df keeps its row labels from the full frame, so label alignment would fill NaN.
                for col in non_imputable_columns:
                    df_sensor_imputed_ts[col] = group_df[col].values
                df_sensor_imputed_ts[self.timestamp_column_name] = group_df[self.timestamp_column_name].values
                df_sensor_imputed_ts[self.asset_id_column_name] = asset_id

                self.logger.debug(
                    'in the loop AutoImputationTransformer execute() start_ts=%s, end_ts=%s, entities=%s, imputed df=%s',
                    start_ts,
                    end_ts,
                    entities,
                    log_df_info(df_sensor_imputed_ts, head=15, logger=self.logger, log_level=logging.DEBUG)
                )

                per_asset_frames.append(df_sensor_imputed_ts.set_index(index_columns))
            else:
                # Nothing to impute: keep the rows as received, indexed like the imputed frames.
                per_asset_frames.append(group_df.set_index(index_columns))

        # Concatenate once and keep the (asset id, timestamp) index; discarding it would lose the
        # identity of every imputed row.
        df_imputed_combined = pd.concat(per_asset_frames) if per_asset_frames else pd.DataFrame()

        self.logger.debug(f'running data cols {df_imputed_combined.columns}')
        self.logger.debug(f'Missing values after imputation: {str(df_imputed_combined.isna().sum())}')
        self.logger.debug('Finished auto imputation of missing values. The shape of the imputed data frame = '+str(df_imputed_combined.shape))

        return df_imputed_combined
                              
class MissingValueImputationTransformer(_BaseTransformer):
    """
    Analyzes the dataset using DQLearn's MissingValueAnalysis, and conditionally imputes the missing data. This code will 
    also raise an error if features exceed their missing value thresholds and the user prefers to abort the 
    training / scoring process in that case.
    """

    def __init__(self, included_cols:list=None, excluded_cols:list=None, 
                 missing_thresholds=None, stop_if_missing_values_exceed_threshold:bool=False, 
                 asset_id_column_name:str ='id', timestamp_column_name='evt_timestamp', **kwargs):
        """
        Parameters
        ----------
        included_cols : `list`, optional
            List of columns to check for missing values and imputation. If empty or not given, all columns excluding 
            those in `excluded_cols` are checked.
        excluded_cols : `list`, optional
            List of columns to avoid for missing value analysis and imputation. All meta data and ID columns could be part 
            of this list. When not given this is `['id', 'evt_timestamp', 'faildate', 'installdate']`.
        missing_thresholds : `float`, `int`, `dict` or None
            Maximum fraction of data allowed to be missing. A single number is applied as the threshold for all columns; 
            a `dict` carries a threshold per individual column (columns absent from the dict use the default). The 
            default is `None`, which means a default value of 0.2 (20%) is applied to all features / columns earmarked 
            for Missing Value Analysis.
        stop_if_missing_values_exceed_threshold : `bool`
            This flag indicates the user preference as to whether the pipeline should be stopped from further execution if 
            the missing values exceed the threshold in **ANY** of the columns considered for Missing Value Analysis.
        asset_id_column_name : `str`
            This contains the name of the column in the input data frame that contains the ID of the asset / device. By 
            default this value is `id`
        timestamp_column_name : `str`
            This contains the name of the column in the input data frame that contains the timestamp of the measurement.
            Note that the data frame may have different timestamps for different purposes / reasons, but this timestamp is 
            the unique record level timestamp that can be used for indexing if needed. The default value for this parameter 
            is `evt_timestamp`
        **kwargs : `dict`
            At the moment only the key `auto_imputation_config` is read. If it is not provided (or is None), the auto 
            imputation will be skipped. If provided with an empty dictionary (like {}), default values will be used as 
            outlined below.

            The sample auto imputation config would look like the following -

            `level`: 'default'
            `save_prefix` : 'auto_imputation_'
            `execution_platform` : 'spark_node_random_search'
            `cv_type` : 'ImputationKFold'
            `cv` : 5
            `scoring` : 'neg_mean_absolute_error'
            `execution_time_per_pipeline` : 2
            `total_execution_time` : 10
            `missing_vals` : 0.1
            `random_state` : any integer (default is 42)
            `imputation_type` : 'timeseries' or 'iid' (the default is 'timeseries')
            `use_mcar` : bool stored on the transformer and handed to `run_auto_imputation` (default is True)

        Raises
        ------
        ValueError
            If `missing_thresholds` is neither None, a number, nor a dict.
        """
        super().__init__()

        # Fresh lists per instance instead of mutable default arguments shared by every instance.
        if included_cols is None:
            included_cols = []
        if excluded_cols is None:
            excluded_cols = ['id', 'evt_timestamp', 'faildate', 'installdate']

        self.default_missing_threshold = 0.2 # just to initialize. Keep this simple, as there are no true constants in Python like C++ and Java. Not using Py3.8's Final now
        if missing_thresholds is None:
            self.logger.info('No missing value threshold was configured. Using default value of %s', self.default_missing_threshold)
            self.missing_thresholds = {}
        elif isinstance(missing_thresholds, dict):
            self.missing_thresholds = missing_thresholds
        elif isinstance(missing_thresholds, (int, float)) and not isinstance(missing_thresholds, bool):
            # Whole numbers such as 0 or 1 are valid thresholds too; bool is excluded because it is an int subclass.
            self.logger.info('Only one missing value threshold %s was configured. This value will be applied as threshold for all the columns / features', missing_thresholds)
            self.default_missing_threshold = missing_thresholds
            self.missing_thresholds = {}
        else:
            # Exceptions do not interpolate logging-style arguments, so the message is formatted here.
            raise ValueError(
                "Inappropriate value was configured for the parameter `missing_thresholds`. The values accepted are "
                "either None, a float value, an empty dictionary, or a dictionary with columnwise threshold values. "
                f"The type received was {type(missing_thresholds)} with value {missing_thresholds}"
            )

        self.included_cols = included_cols
        self.excluded_cols = excluded_cols
        self.cols_for_mva_imputation = included_cols
        self.stop_if_missing_values_exceed_threshold = stop_if_missing_values_exceed_threshold
        self.asset_id_column_name = asset_id_column_name
        self.timestamp_column_name = timestamp_column_name

        self.use_mcar = True # Default value if not configured

        auto_imputation_config = kwargs.get('auto_imputation_config', None)
        if auto_imputation_config is None:
            # Without a configuration no imputer is built and execute() never imputes.
            self.imputation = None
        else:
            self.use_mcar = auto_imputation_config.get('use_mcar', True)
            self.imputation = AutoImputation(level=auto_imputation_config.get('level', 'default'),
                                         save_prefix=auto_imputation_config.get('save_prefix', 'auto_imputation_'),
                                         execution_platform=auto_imputation_config.get('execution_platform', 'spark_node_random_search'),
                                         cv_type=auto_imputation_config.get('cv_type', 'ImputationKFold'),
                                         cv=auto_imputation_config.get('cv', 5),
                                         scoring=auto_imputation_config.get('scoring', 'neg_mean_absolute_error'),
                                         execution_time_per_pipeline=auto_imputation_config.get('execution_time_per_pipeline', 2),
                                         total_execution_time=auto_imputation_config.get('total_execution_time', 10),
                                         missing_vals=auto_imputation_config.get('missing_vals', 0.1),
                                         random_state=auto_imputation_config.get('random_state', 42),
                                         imputation_type=auto_imputation_config.get('imputation_type','timeseries'))

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Run the missing value analysis and, when configured, the auto imputation.

        Parameters
        ----------
        df : `DataFrame`
            The input dataframe to analyze.
        start_ts, end_ts, entities : optional
            Accepted for interface compatibility with the other transformers; not used here.

        Returns
        -------
        `DataFrame`
            The result of `run_auto_imputation` when at least one feature exceeds its threshold and an
            auto imputation configuration was provided; otherwise `df` as received.

        Raises
        ------
        RuntimeError
            If at least one feature exceeds its threshold and `stop_if_missing_values_exceed_threshold` is set.
        """
        self.logger.info('Beginning execution of MissingValueImputationTransformer. missing_thresholds=%s', self.missing_thresholds)
        self.logger.debug('DF to check for missing values on: %s', log_df_info(df, head = 5, logger=self.logger, log_level=logging.DEBUG))

        if not self.included_cols:
            # No explicit selection: analyze every column that is not excluded.
            self.cols_for_mva_imputation = [col for col in df.columns if col not in self.excluded_cols]
            self.logger.debug('Excluding checks on columns %s', self.excluded_cols)

        self.logger.info('Running missing value analysis on %s items from columns %s', len(df), self.cols_for_mva_imputation)
        from dqlearn.missing.missing_pattern import check_missing_properties
        missing_val_report = check_missing_properties(
                                df=df, 
                                data_cols = self.cols_for_mva_imputation,
                                missing_threshold_dict=self.missing_thresholds,
                                default_missing_threshold=self.default_missing_threshold,  
                                verify_df=True)
        # The returned report is called to obtain the result dictionary.
        missing_val_report_dict = missing_val_report()

        above_threshold_features = self.missing_rate_above_threshold(missing_val_report_dict, self.missing_thresholds)
        if not above_threshold_features:
            self.logger.info('No features had missing values above the threshold, so AutoImputation will be skipped. Returning the original DataFrame as received.')
            return df

        if self.stop_if_missing_values_exceed_threshold:
            msg = 'Training failed because the following features exceeded their thresholds: ' 
            msg += str({feature: f'threshold: {self.missing_thresholds.get(feature, self.default_missing_threshold)}, actual: {actual}' for feature, actual in above_threshold_features.items()})
            self.logger.warning(msg)
            raise RuntimeError(msg)

        if self.imputation is not None:
            self.logger.info('Finished running missing value report. Running AutoImputation for missing values...')
            return self.run_auto_imputation(
                df, 
                features_for_imputation = self.cols_for_mva_imputation, 
                missing_value_characteristics = missing_val_report_dict, 
                use_mcar = self.use_mcar, 
                asset_id_column_name = self.asset_id_column_name, 
                sensor_df_timestamp_column_name = self.timestamp_column_name, 
                excluded_columns = self.excluded_cols
            )

        self.logger.warning(
            'Multiple features exceeded the specified threshold, but an AutoImputation configuration was not ' +
            'provided and stop_if_missing_values_exceed_threshold is set to False, so the training will continue.'
        )
        return df
        

    def missing_rate_above_threshold(self, missing_val_dict: dict, missing_thresholds: dict) -> dict[str, float]:
        """
        Collect the features whose missing value rate is strictly greater than their threshold.

        Parameters
        ----------
        missing_val_dict: `dict`
            missing value dictionary return from calling dqlearn.missing.missing_pattern.check_missing_properties.
            Only `['results']['details']['feature_missing_properties']` is read; each entry must carry
            `missing_rate`, `num_missing` and `sample_size`.
        missing_thresholds: `dict`
            per-feature thresholds; a feature without an entry falls back to `self.default_missing_threshold`

        Returns
        -------
        dict[str, float]
            the features above their threshold as keys, each mapped to its actual missing rate
        """
        per_feature_properties = missing_val_dict['results']['details']['feature_missing_properties']
        offenders = {}
        for name, properties in per_feature_properties.items():
            limit = missing_thresholds.get(name, self.default_missing_threshold)
            rate = properties['missing_rate']
            self.logger.debug(
                'Missing values for feature %s: threshold=%s, missing_rate=%s, num_missing=%s, sample_size=%s',
                name, limit, rate, properties['num_missing'], properties['sample_size'])

            if not rate > limit:
                self.logger.debug('Attribute %s missing value rate (%s) did not exceed threshold %s!', name, rate, limit)
                continue

            self.logger.debug('Attribute %s missing value rate (%s) exceeded threshold %s!', name, rate, limit)
            offenders[name] = rate

        self.logger.info('Features with missing value rates above the threshold are: %s', offenders)
        return offenders

    
    def run_auto_imputation(self, df, features_for_imputation, missing_value_characteristics, use_mcar = True, asset_id_column_name = 'id', 
                            sensor_df_timestamp_column_name = 'evt_timestamp', excluded_columns = None):
        """Impute missing values asset by asset with the configured AutoImputation.

        Parameters
        ----------
        df : `DataFrame`
            The input dataframe. A multi-level index is flattened into columns first.
        features_for_imputation : `list`
            NOTE(review): currently unused - every non-excluded column with missing values is a candidate
            for imputation. Confirm whether imputation should be restricted to this list.
        missing_value_characteristics : `dict`
            Missing value report; `['results']['details']['feature_missing_properties']` is read, and each
            feature's `reject_MCAR` entry decides whether it may be imputed when `use_mcar` is set.
        use_mcar : `bool`
            When True only features whose report entry has a falsy `reject_MCAR` are imputed (a feature
            without a report entry is treated as rejected). When False every candidate feature is imputed.
        asset_id_column_name : `str`
            Column used to group the rows per asset.
        sensor_df_timestamp_column_name : `str`
            Timestamp column; together with the asset id it forms the index of the result.
        excluded_columns : `list`, optional
            Columns that must never be imputed. Names not present in the dataframe are ignored.

        Returns
        -------
        `DataFrame`
            The per-asset frames concatenated and indexed by (asset id, timestamp). An empty `DataFrame`
            is returned when there is no asset to process.
        """
        self.logger.debug('Running auto imputation...')
        self.logger.debug('AutoImputation configuration: %s', self.imputation)

        if excluded_columns is None:
            excluded_columns = []

        missing_value_characteristics_by_feature = missing_value_characteristics['results']['details']['feature_missing_properties']

        df_for_imputation = df
        if df.index.nlevels > 1:
            df_for_imputation = df.reset_index(inplace = False, drop = False) # This will increase memory footprint, but safer this way than resetting inplace

        # Make sure the excluded columns blacklists only those that are present in the dataframe. Remove the superfluous columns. This will make it possible for 
        # setting a unified list of excluded columns once, while allowing the use case specific dataframes to be passed in with or without those columns
        columns_in_df = df_for_imputation.columns.tolist()
        self.logger.debug('The columns in the input data frame (after flattening multi-index if applicable) are %s = ', columns_in_df)
        excluded_columns = [col for col in excluded_columns if col in columns_in_df]
        self.logger.debug('The columns excluded from the imputation are %s = ', excluded_columns)

        if use_mcar:
            self.logger.debug('**NOTE** MCAR condition is enabled, meaning values will only be ' +
                              'imputed if they are Missing Completely At Random. However, also ' +
                              'note that the MCAR condition cannot be confirmed with absolute certainty.')
        else:
            self.logger.debug('**NOTE** that the MCAR condition is not checked. Make sure the ' + 
                              'values are not missing for reason or following specific pattern.' +
                              'We do not want to impute values if there is a good reason for ' +
                              'them being missing, like the machine was under service and not running.')

        index_columns = [asset_id_column_name, sensor_df_timestamp_column_name]

        def as_received(frame, asset):
            # Work on a copy so the slice handed out by groupby is never modified in place.
            passthrough = frame.copy()
            passthrough[asset_id_column_name] = asset
            return passthrough.set_index(index_columns)

        per_asset_frames = []
        self.logger.debug('Iterating over assets for AutoImputation...')
        for asset_id, grouped_df in df_for_imputation.groupby(asset_id_column_name):
            self.logger.debug('Performing auto imputation for asset %s', asset_id)
            missing_value_count = grouped_df.isna().sum()
            self.logger.debug('Missing value count for the asset %s is %s', asset_id, log_df_info(grouped_df, head=0, include_missing_value_count=True, logger=self.logger, log_level=logging.DEBUG))

            if missing_value_count.sum() == 0:
                self.logger.debug('No missing values found in the data frame for asset %s. Skipping auto imputation and using this as is.', asset_id)
                per_asset_frames.append(as_received(grouped_df, asset_id))
                continue

            # Excluded columns with missing values land in neither list; they are preserved through
            # excluded_columns further below.
            features_missing_values = []
            features_not_missing_values = []
            for key, count in missing_value_count.items():
                if count > 0:
                    if key not in excluded_columns:
                        features_missing_values.append(key)
                else:
                    features_not_missing_values.append(key)

            features_mcar = []
            features_not_mcar = []
            if not use_mcar:
                self.logger.debug('Ignoring MCAR pattern check as it is not required.')
                features_mcar = features_missing_values
                self.logger.debug('Features that contain random missing values for the asset %s are %s', asset_id, features_mcar)
            else: # honor MCAR check
                self.logger.debug('Checking the MCAR patterns of the individual features.')
                for feature in features_missing_values:
                    mvc_for_feature = missing_value_characteristics_by_feature.get(feature, {})
                    self.logger.debug('Missing value characteristics for the asset %s and %s are %s', asset_id, feature, mvc_for_feature)

                    if mvc_for_feature.get('reject_MCAR', True):
                        features_not_mcar.append(feature)
                    else:
                        features_mcar.append(feature) # **ONLY THESE FEATURES WILL BE IMPUTED**
                self.logger.debug('Features that possibly contain random missing values for the asset %s are %s', asset_id, features_mcar)

            # Keep the dataframe's own column order so the layout of the result is deterministic.
            not_imputed = set(excluded_columns) | set(features_not_missing_values) | set(features_not_mcar)
            final_list_of_features_to_exclude = [col for col in grouped_df.columns if col in not_imputed]
            non_imputed_df = grouped_df[final_list_of_features_to_exclude]
            self.logger.debug(
                'DF for asset_id=%s. This is asset specific data not considered for AutoImputation, meaning it will be returned as is. non_imputed_df=%s', 
                asset_id, 
                log_df_info(non_imputed_df, head=5, logger=self.logger, log_level=logging.DEBUG)
            )

            if not features_mcar:
                self.logger.debug(
                    'Even though there are missing values found in the data frame for asset %s, ' +
                    'none of the features pass the test of MCAR. Therefore skipping auto ' +
                    'imputation and returning the original dataframe for the asset as received. ' +
                    'Either address the missing values at the source or turn off the use_mcar check', 
                    asset_id)
                per_asset_frames.append(as_received(grouped_df, asset_id))
                continue

            frame_to_be_imputed = grouped_df[features_mcar].values
            self.logger.debug(
                'Make sure the slice of data frame to be imputed has the same number of rows as the frame that contains excluded cols: '
                'frame to be imputed = %s frame to be preserved (not imputed) and added = %s',
                str(frame_to_be_imputed.shape), str(non_imputed_df.shape))
            # begin auto imputation
            self.imputation.automate(frame_to_be_imputed, frame_to_be_imputed)
            self.imputation.fit(frame_to_be_imputed, frame_to_be_imputed)
            frame_after_imputation = self.imputation.transform(frame_to_be_imputed)

            # put results back into df. Make sure to fill the asset ids, timestamps and any other identifiers back.
            frame_after_imputation_df = pd.DataFrame(frame_after_imputation, columns = features_mcar)

            non_imputed_columns_to_append = [col for col in final_list_of_features_to_exclude if col not in index_columns]
            if non_imputed_columns_to_append:
                self.logger.debug('Reconstructing the final data frame by adding the non-imputed columns: %s to the imputed data frame',non_imputed_columns_to_append)
                # The imputed frame has a fresh 0..n-1 index whereas non_imputed_df keeps the row labels of
                # the full frame. Renumber the preserved rows before joining; aligning on the original
                # labels would fill these columns with NaN for every asset but the first.
                preserved = non_imputed_df[non_imputed_columns_to_append].reset_index(drop=True)
                frame_after_imputation_df = pd.concat([frame_after_imputation_df, preserved], axis=1)
            else:
                self.logger.debug('No non-imputed / static columns to append to the imputed data frame for the asset %s ', asset_id)

            frame_after_imputation_df[asset_id_column_name] = asset_id
            frame_after_imputation_df[sensor_df_timestamp_column_name] = grouped_df[sensor_df_timestamp_column_name].values

            self.logger.debug(
                'DF for asset=%s. This is asset specific data whose missing values have been imputed with AutoImputation. df_sensor_imputed_ts=%s', 
                asset_id, 
                log_df_info(frame_after_imputation_df, head=5, include_missing_value_count = True, logger=self.logger, log_level=logging.DEBUG)
            )
            per_asset_frames.append(frame_after_imputation_df.set_index(index_columns))

        # Concatenate once at the end instead of growing a frame inside the loop.
        df_imputed_combined = pd.concat(per_asset_frames) if per_asset_frames else pd.DataFrame()

        self.logger.debug('Finished auto imputation of missing values. Returning this DataFrame: %s', log_df_info(df_imputed_combined, head=5, logger=self.logger, log_level=logging.DEBUG))
        return df_imputed_combined

class ContextTransformer(_BaseTransformer):
    """This transformer filters the rows of a dataframe based on context_expression and optionally drops columns.

    The expression is evaluated with `DataFrame.query`. The special value `'no_filter'` disables the row
    filtering, so only the column exclusion (if any) is applied.

    Sample context_expression: VELOCITYX > 0.1214597878601365 & VELOCITYY < 0.0214597878601365
    """
    def __init__(self, context_expression, exclude_column_list=None):
        """
        Parameters
        ----------
        context_expression : `str`
            The expression used to select rows, or `'no_filter'` to keep all rows.
        exclude_column_list : `list`, optional
            Names of the columns to remove from the result. Names that are not present in the dataframe
            are ignored. Default is None, meaning no column is removed.

        Raises
        ------
        ValueError
            If `context_expression` is None or a blank string.
        """

        super().__init__()

        # A blank expression can never be evaluated, so reject it here rather than at execution time.
        is_blank = isinstance(context_expression, str) and not context_expression.strip()
        if context_expression is None or is_blank:
            raise ValueError('context_expression must be a non-empty expression string')
        self.context_expression = context_expression
        self.exclude_column_list = exclude_column_list

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Apply the context expression and the column exclusion to `df`.

        Parameters
        ----------
        df : `DataFrame`
            The input dataframe.
        start_ts, end_ts, entities : optional
            Only used for logging.

        Returns
        -------
        `DataFrame`
            The rows of `df` matching the context expression (all rows for `'no_filter'`), without the
            excluded columns. The remaining columns keep their original order.
        """
        self.logger.debug('Beginning execution of ContextTransformer. Input: start_ts=%s, end_ts=%s, entities=%s, df_input=%s', start_ts, end_ts, entities, log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        self.logger.debug('Excluding columns: %s', self.exclude_column_list)

        self.logger.debug('ContextTransformer execute self.context_expression is %s', self.context_expression)
        if self.context_expression == 'no_filter':
            modified_df = df
        else:
            modified_df = df.query(self.context_expression)

        if self.exclude_column_list is not None:
            # drop() keeps the surviving columns in their original order; a round trip through a set
            # would shuffle them unpredictably.
            modified_df = modified_df.drop(columns=list(self.exclude_column_list), errors='ignore')

        self.logger.debug('Finished executing ContextTransformer. Results: modified_df=%s', log_df_info(modified_df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return modified_df
