# -*- coding: utf-8 -*-

# Licensed Materials - Property of IBM
# 5737-M66, 5900-AAA
# (C) Copyright IBM Corp. 2019, 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication, or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.

"""
.. module:: temporal_feature_eng
   :synopsis: Feature engineering based on timestamps and windows of time
.. moduleauthor:: Amaresh Rajasekharan, Kewei Yang

"""

import pandas as pd
import numpy as np
import datetime
from datetime import datetime
from datetime import timedelta
import logging

from pmlib import util
from srom.feature_engineering.timeseries.rolling_window_feature_extraction import \
    (simple_summary_statistics,higher_order_summary_statistics, advance_summary_statistics)

def create_temporal_features(df, variable_names, rolling_window_size, minimum_periods,
                             simple_agg_fns, higher_order_agg_fns, adv_agg_fns,
                             asset_id_column_name = 'id',
                             timestamp_column_name = 'evt_timestamp',
                             timestamp_format=None,
                             data_source_type = 'mas_monitor_data_lake',
                             flatten_headers = True):
    """
    Run the simple, higher-order and advanced temporal feature stages in sequence.

    Each stage receives the frame returned by the previous one, so the result
    carries the feature columns of every stage that was given aggregation
    functions. A banner line is printed before each stage.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Timestamped records to derive features from.
    variable_names : `list`
        Variables to compute features for. Forwarded unchanged to every stage;
        may be None or empty.
        NOTE(review): when this is None/empty, the later stages receive a frame
        that already contains the earlier stages' feature columns. Whether the
        srom helpers then treat those columns as inputs is not visible here;
        confirm.
    rolling_window_size : `str`
        Rolling window size, e.g. '3H'. Forwarded to every stage.
    minimum_periods
        Forwarded to every stage (see ``pandas.DataFrame.rolling``).
    simple_agg_fns : `list` or 'ALL'
        Aggregations for ``create_simple_temporal_features``:
        "mean", "max", "min", "median", "std", "sum", "count".
    higher_order_agg_fns : `list` or 'ALL'
        Aggregations for ``create_higher_order_temporal_features``:
        'sum', 'skew', 'kurt', 'quantile_25', 'quantile_75', 'quantile_range'.
    adv_agg_fns : `list` or 'ALL'
        Aggregations for ``create_advanced_temporal_features``:
        'rate_of_change', 'sum_of_change', 'absoluate_sum_of_changes',
        'trend_slop', 'abs_energy', 'mean_abs_change', 'mean_change',
        'mean_second_derivate_central', 'count_above_mean', 'count_below_mean'.
    asset_id_column_name : `str`
        Name of the asset id column. Defaults to "id".
    timestamp_column_name : `str`
        Name of the timestamp column. Defaults to "evt_timestamp".
    timestamp_format : `str`, optional
        Timestamp format forwarded to every stage.
    data_source_type : `str`, optional
        Defaults to "mas_monitor_data_lake"; forwarded to every stage.
    flatten_headers : `bool`, optional
        Defaults to True; forwarded to every stage.

    Returns
    -------
    `pandas.DataFrame`
        The frame returned by the last stage.
    """
    # (banner, stage function, aggregation functions) in execution order.
    pipeline = (
        ('Creating simple temporal features',
         create_simple_temporal_features, simple_agg_fns),
        ('Creating higher order temporal features',
         create_higher_order_temporal_features, higher_order_agg_fns),
        ('Creating advanced temporal features',
         create_advanced_temporal_features, adv_agg_fns),
    )

    result = df
    for banner, stage, stage_agg_fns in pipeline:
        print(banner)
        result = stage(
            result,
            variable_names,
            rolling_window_size,
            minimum_periods,
            stage_agg_fns,
            asset_id_column_name=asset_id_column_name,
            timestamp_column_name=timestamp_column_name,
            timestamp_format=timestamp_format,
            data_source_type=data_source_type,
            flatten_headers=flatten_headers,
        )
    return result

def create_simple_temporal_features(df:pd.DataFrame, variable_names:list, rolling_window_size:str,\
        minimum_periods:str, agg_fns:list, asset_id_column_name:str = 'id',\
        timestamp_column_name:str = 'evt_timestamp', timestamp_format:str = None,\
        data_source_type:str = 'mas_monitor_data_lake', flatten_headers:bool = True)->pd.DataFrame:
    """
    Add rolling-window simple summary statistics to timestamped records.

    The statistics are computed by ``simple_summary_statistics`` (called with
    ``aggregation_type="time"``) and the resulting feature columns are copied
    onto ``df``.

    Side effects: ``df`` is modified in place (its index may be reset and
    feature columns are added) and the same object is returned.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Records used for creating the temporal features.
    variable_names : `list`
        Variables to compute features for. May be None or empty, in which case
        None/empty is forwarded and the selection is left to
        ``simple_summary_statistics``.
    rolling_window_size : `str`
        Rolling window size, e.g. '3H' (pandas offset alias).
    minimum_periods
        Forwarded as ``min_periods``.
        NOTE(review): annotated as ``str`` here, while pandas documents
        ``min_periods`` of ``DataFrame.rolling`` as an int; confirm the type
        the srom helper expects.
    agg_fns : `list` or 'ALL'
        One or more of "mean", "max", "min", "median", "std", "sum", "count",
        or the string 'ALL' for all of them. None or empty returns ``df``
        untouched.
    asset_id_column_name : `str`
        Name of the asset id column. Defaults to "id".
    timestamp_column_name : `str`
        Name of the timestamp column. Defaults to "evt_timestamp".
    timestamp_format : `str`, optional
        Forwarded as ``date_clm_format``.
    data_source_type : `str`, optional
        Defaults to "mas_monitor_data_lake". Compared case-insensitively; when
        it is "mas_monitor_data_lake" the index of ``df`` is reset into
        columns.
    flatten_headers : `bool`, optional
        Defaults to True. When True the index of ``df`` is reset into columns
        regardless of ``data_source_type``.

    Returns
    -------
    `pandas.DataFrame`
        ``df`` with the feature columns added; the original columns are kept.
    """
    if agg_fns is None or len(agg_fns) == 0:
        # Adjacent literals instead of a backslash inside the string, which
        # used to embed the source indentation in the printed message.
        print('create_simple_temporal_features: returning the input data frame as is because no '
              'aggregation functions are specified for the function')
        return df
    if isinstance(agg_fns, str) and agg_fns == 'ALL':
        agg_fns = ["mean", "max", "min", "median", "std", "sum", "count"]

    # The accepted values are documented as case insensitive.
    is_data_lake = str(data_source_type).lower() == 'mas_monitor_data_lake'
    if is_data_lake or flatten_headers:
        df.reset_index(inplace = True, drop = False)

    original_columns = df.columns.values.tolist()
    if 'index' in original_columns:
        # Resetting an unnamed index yields a synthetic 'index' column.
        df.drop('index', axis = 1, inplace = True)
        original_columns = df.columns.values.tolist()

    # Columns that must not be counted as newly engineered features.
    if variable_names is not None and len(variable_names) > 0:
        non_feature_columns = list(variable_names) + [asset_id_column_name, timestamp_column_name]
    else:
        non_feature_columns = original_columns

    stats_df = simple_summary_statistics(
        df, rolling_window_size, variable_names,
        asset_id_column_name, timestamp_column_name, date_clm_format=timestamp_format,
        min_periods=minimum_periods, aggregation_type="time", aggregation_methods = agg_fns)

    # 'asset_id' and 'datetime' are working columns, not features
    # (assumption: they are produced by the srom helper; the original code
    # dropped them unconditionally). Never copy them over, so that a caller's
    # own column with one of these names is neither overwritten nor removed.
    helper_columns = {'asset_id', 'datetime'}
    fe_columns = sorted(
        set(stats_df.columns.values.tolist()) - set(non_feature_columns) - helper_columns)
    df[fe_columns] = stats_df[fe_columns]

    # Clean up helper columns only if they appeared on df during the call.
    leftover_columns = [column for column in df.columns.values.tolist()
                        if column in helper_columns and column not in original_columns]
    if leftover_columns:
        df.drop(leftover_columns, axis = 1, inplace = True)

    print('Columns in the data frame after computing simple summary statistics = ', str(df.columns.values.tolist()))
    return df

def create_higher_order_temporal_features(df:pd.DataFrame, variable_names:list, rolling_window_size:str,\
        minimum_periods:str, agg_fns:list, asset_id_column_name:str = 'id',\
        timestamp_column_name:str = 'evt_timestamp', timestamp_format:str = None,\
        data_source_type:str = 'mas_monitor_data_lake', flatten_headers:bool = True)->pd.DataFrame:
    """
    Add rolling-window higher-order summary statistics to timestamped records.

    The statistics are computed by ``higher_order_summary_statistics`` (called
    with ``aggregation_type="time"``) and the resulting feature columns are
    copied onto ``df``.

    Side effects: ``df`` is modified in place (its index may be reset and
    feature columns are added) and the same object is returned.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Records used for creating the temporal features.
    variable_names : `list`
        Variables to compute features for. May be None or empty, in which case
        None/empty is forwarded and the selection is left to
        ``higher_order_summary_statistics``.
    rolling_window_size : `str`
        Rolling window size, e.g. '3H' (pandas offset alias).
    minimum_periods
        Forwarded as ``min_periods``.
        NOTE(review): annotated as ``str`` here, while pandas documents
        ``min_periods`` of ``DataFrame.rolling`` as an int; confirm the type
        the srom helper expects.
    agg_fns : `list` or 'ALL'
        One or more of 'sum', 'skew', 'kurt', 'quantile_25', 'quantile_75',
        'quantile_range', or the string 'ALL' for all of them. None or empty
        returns ``df`` untouched.
    asset_id_column_name : `str`
        Name of the asset id column. Defaults to "id".
    timestamp_column_name : `str`
        Name of the timestamp column. Defaults to "evt_timestamp".
    timestamp_format : `str`, optional
        Forwarded as ``date_clm_format``.
    data_source_type : `str`, optional
        Defaults to "mas_monitor_data_lake". Compared case-insensitively; when
        it is "mas_monitor_data_lake" the index of ``df`` is reset into
        columns.
    flatten_headers : `bool`, optional
        Defaults to True. When True the index of ``df`` is reset into columns
        regardless of ``data_source_type``.

    Returns
    -------
    `pandas.DataFrame`
        ``df`` with the feature columns added; the original columns are kept.
    """
    if agg_fns is None or len(agg_fns) == 0:
        # Adjacent literals instead of a backslash inside the string, which
        # used to embed the source indentation in the printed message.
        print('create_higher_order_temporal_features: returning the input data frame as is because no '
              'aggregation functions are specified for the function')
        return df
    if isinstance(agg_fns, str) and agg_fns == 'ALL':
        agg_fns = ['sum','skew','kurt','quantile_25','quantile_75','quantile_range']

    # The accepted values are documented as case insensitive.
    is_data_lake = str(data_source_type).lower() == 'mas_monitor_data_lake'
    if is_data_lake or flatten_headers:
        df.reset_index(inplace = True, drop = False)

    original_columns = df.columns.values.tolist()
    if 'index' in original_columns:
        # Resetting an unnamed index yields a synthetic 'index' column.
        df.drop('index', axis = 1, inplace = True)
        original_columns = df.columns.values.tolist()

    # Columns that must not be counted as newly engineered features.
    if variable_names is not None and len(variable_names) > 0:
        non_feature_columns = list(variable_names) + [asset_id_column_name, timestamp_column_name]
    else:
        non_feature_columns = original_columns

    stats_df = higher_order_summary_statistics(
        df, rolling_window_size, variable_names,
        asset_id_column_name, timestamp_column_name, date_clm_format=timestamp_format,
        min_periods=minimum_periods, aggregation_type="time", aggregation_methods = agg_fns)

    # 'asset_id' and 'datetime' are working columns, not features
    # (assumption: they are produced by the srom helper; the original code
    # dropped them unconditionally). Never copy them over, so that a caller's
    # own column with one of these names is neither overwritten nor removed.
    helper_columns = {'asset_id', 'datetime'}
    fe_columns = sorted(
        set(stats_df.columns.values.tolist()) - set(non_feature_columns) - helper_columns)
    df[fe_columns] = stats_df[fe_columns]

    # Clean up helper columns only if they appeared on df during the call.
    leftover_columns = [column for column in df.columns.values.tolist()
                        if column in helper_columns and column not in original_columns]
    if leftover_columns:
        df.drop(leftover_columns, axis = 1, inplace = True)

    print('Columns in the data frame after computing higher order summary statistics = ', str(df.columns.values.tolist()))
    return df

def create_advanced_temporal_features(df:pd.DataFrame, variable_names:list, rolling_window_size:str,\
        minimum_periods:str, agg_fns:list, asset_id_column_name:str = 'id',\
        timestamp_column_name:str = 'evt_timestamp', timestamp_format:str = None,\
        data_source_type:str = 'mas_monitor_data_lake', flatten_headers:bool = True)->pd.DataFrame:
    """
    Add rolling-window advanced summary statistics to timestamped records.

    The statistics are computed by ``advance_summary_statistics`` (called with
    ``aggregation_type="time"``) and the resulting feature columns are copied
    onto ``df``.

    Side effects: ``df`` is modified in place (its index may be reset and
    feature columns are added) and the same object is returned.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Records used for creating the temporal features.
    variable_names : `list`
        Variables to compute features for. May be None or empty, in which case
        None/empty is forwarded and the selection is left to
        ``advance_summary_statistics``.
    rolling_window_size : `str`
        Rolling window size, e.g. '3H' (pandas offset alias).
    minimum_periods
        Forwarded as ``min_periods``.
        NOTE(review): annotated as ``str`` here, while pandas documents
        ``min_periods`` of ``DataFrame.rolling`` as an int; confirm the type
        the srom helper expects.
    agg_fns : `list` or 'ALL'
        One or more of 'rate_of_change', 'sum_of_change',
        'absoluate_sum_of_changes', 'trend_slop', 'abs_energy',
        'mean_abs_change', 'mean_change', 'mean_second_derivate_central',
        'count_above_mean', 'count_below_mean', or the string 'ALL' for all of
        them. None or empty returns ``df`` untouched.
    asset_id_column_name : `str`
        Name of the asset id column. Defaults to "id".
    timestamp_column_name : `str`
        Name of the timestamp column. Defaults to "evt_timestamp".
    timestamp_format : `str`, optional
        Forwarded as ``date_clm_format``.
    data_source_type : `str`, optional
        Defaults to "mas_monitor_data_lake". Compared case-insensitively; when
        it is "mas_monitor_data_lake" the index of ``df`` is reset into
        columns.
    flatten_headers : `bool`, optional
        Defaults to True. When True the index of ``df`` is reset into columns
        regardless of ``data_source_type``.

    Returns
    -------
    `pandas.DataFrame`
        ``df`` with the feature columns added; the original columns are kept.
    """
    if agg_fns is None or len(agg_fns) == 0:
        # Adjacent literals instead of a backslash inside the string, which
        # used to embed the source indentation in the printed message.
        print('create_advanced_temporal_features: returning the input data frame as is because no '
              'aggregation functions are specified for the function')
        return df
    if isinstance(agg_fns, str) and agg_fns == 'ALL':
        agg_fns = ['rate_of_change', 'sum_of_change','absoluate_sum_of_changes','trend_slop',
                   'abs_energy','mean_abs_change','mean_change','mean_second_derivate_central',
                   'count_above_mean','count_below_mean']

    # The accepted values are documented as case insensitive.
    is_data_lake = str(data_source_type).lower() == 'mas_monitor_data_lake'
    if is_data_lake or flatten_headers:
        df.reset_index(inplace = True, drop = False)

    original_columns = df.columns.values.tolist()
    if 'index' in original_columns:
        # Resetting an unnamed index yields a synthetic 'index' column.
        df.drop('index', axis = 1, inplace = True)
        original_columns = df.columns.values.tolist()

    # Columns that must not be counted as newly engineered features.
    if variable_names is not None and len(variable_names) > 0:
        non_feature_columns = list(variable_names) + [asset_id_column_name, timestamp_column_name]
    else:
        non_feature_columns = original_columns

    stats_df = advance_summary_statistics(
        df, rolling_window_size, variable_names,
        asset_id_column_name, timestamp_column_name, date_clm_format=timestamp_format,
        min_periods=minimum_periods, aggregation_type="time", aggregation_methods = agg_fns)

    # 'asset_id' and 'datetime' are working columns, not features
    # (assumption: they are produced by the srom helper; the original code
    # dropped them unconditionally). Never copy them over, so that a caller's
    # own column with one of these names is neither overwritten nor removed.
    helper_columns = {'asset_id', 'datetime'}
    fe_columns = sorted(
        set(stats_df.columns.values.tolist()) - set(non_feature_columns) - helper_columns)
    df[fe_columns] = stats_df[fe_columns]

    # Clean up helper columns only if they appeared on df during the call.
    leftover_columns = [column for column in df.columns.values.tolist()
                        if column in helper_columns and column not in original_columns]
    if leftover_columns:
        df.drop(leftover_columns, axis = 1, inplace = True)

    print('Columns in the data frame after computing advanced summary statistics = ', str(df.columns.values.tolist()))
    return df
