# Licensed Materials - Property of IBM
# 5737-M66, 5900-AAA
# (C) Copyright IBM Corp. 2019, 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication, or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.

__pdoc__ = {
    'AssetLoader.is_data_source': False,
    'AssetLoader.merge_method': False,
}

import datetime as dt
import json
import logging
import os
from collections import defaultdict
from datetime import timedelta

#Kewei remove xml.etree.ElementTree per Abigail request
#from xml.etree.ElementTree import iselement

import numpy as np
import pandas as pd
import sqlalchemy

from . import api
from .transformer import _BaseTransformer
from .util import get_logger, log_df_info


class AssetLoader(_BaseTransformer):

    is_data_source = True
    """Asset data loader is a data source in AS.

    Data source functions are executed at the beginning stage of AS pipelines.
    """

    merge_method = 'replace'
    """Asset data loader is a primary data source that always replaces the upstream dataframe with its own.

    Using the replace merge method means the asset data loader itself is responsible for deciding which
    dataframe is passed downstream. Internally, the asset data loader does all the necessary merging and
    joining between the input upstream dataframe and the loaded asset data. This merge method makes sure
    AS does not interfere with this logic.
    """

    ASSET_TYPE = ''

    def __init__(self, asset_group, _entity_type, data_items, names, resamples, entity_type_metadata, asset_device_mappings, 
                 fillna='ffill', fillna_exclude=None, dropna='any', dropna_exclude=None, use_cm=False, cm_code='CM'):
        """Store the loader configuration and validate the requested data items.

        Args:
            asset_group: Identifier of the asset group whose data is loaded.
            _entity_type: Entity type object of the asset group; its db handle,
                index id name and timestamp name are read during execution.
            data_items: List of ``'<entity type>:<data item>'`` strings to load.
            names: List of output column names, same length as ``data_items``.
            resamples: Optional dict keyed by entity type holding resampling
                settings; ``None`` is stored as an empty dict.
            entity_type_metadata: Stored as given on the instance.
            asset_device_mappings: Explicit asset-to-device mappings, handed to
                ``_set_asset_device_mappings``.
            fillna, fillna_exclude, dropna, dropna_exclude: Stored as given.
            use_cm: When true, ``execute`` loads corrective maintenance history
                instead of failure history.
            cm_code: Code passed along when corrective maintenance history is
                loaded.

        Raises:
            ValueError: If ``data_items`` and ``names`` are not lists of equal
                length (raised by ``_init_data_items``).
        """
        super().__init__()

        self.asset_group, self._entity_type = asset_group, _entity_type
        self.entity_type_metadata = entity_type_metadata

        # also triggers validation of the requested data items
        self._init_data_items(data_items, names)

        self.resamples = dict() if resamples is None else resamples

        # Mappings passed as a parameter take the highest priority. They are
        # assumed to be used only for testing (offline usage), which essentially
        # stops the mappings from being loaded from the local cache (of the
        # remote system).
        self._set_asset_device_mappings(asset_device_mappings)

        self.fillna, self.fillna_exclude = fillna, fillna_exclude
        self.dropna, self.dropna_exclude = dropna, dropna_exclude

        # per entity type replacement data used instead of loading, empty by default
        self.data_substitution = {}

        # name of the asset id column in all internally built dataframes
        self.df_asset_id_column = 'asset_id'

        self.use_cm = use_cm
        self.cm_code = cm_code

    def _init_data_items(self, data_items, names, skip_validation=False):
        if not isinstance(data_items, list) or not isinstance(names, list) or len(data_items) != len(names):
            raise ValueError('arguments data_items and names must be both list and of same length')

        self.data_items = data_items
        self.names = names

        # validate the given data_items to see if any invalid entity type
        if skip_validation is not True:
            self._validate_data_items()

    def execute(self, df=None, start_ts=None, end_ts=None, entities=None):
        """Load sensor and asset data for the asset group and merge them into one dataframe.

        Every entity type in ``self.entity_type_meta`` is loaded (or taken from
        ``self.data_substitution``), sensor data is mapped to assets through the
        asset device mappings, sensor and asset time series are merged per asset,
        and non-time-series asset dimensions are joined last. Intermediate
        dataframes are recorded in ``self.df_traces``.

        Args:
            df: Upstream dataframe. An empty dataframe is treated as ``None``.
                It is only used as the base dataframe when no time-series data
                is loaded at all.
            start_ts: Optional start of the time range. Inclusive for
                substituted data; otherwise passed through to the loading APIs.
            end_ts: Optional end of the time range. Exclusive for substituted
                data; otherwise passed through to the loading APIs.
            entities: Passed through to ``api.get_entity_type_data`` when
                loading sensor data.

        Returns:
            A dataframe indexed by the entity type's index id name and timestamp
            name, sorted by index, containing only configured output columns.

        Raises:
            ValueError: If no asset device mappings are available.
            RuntimeError: If loading sensor data fails.
        """
        self.df_traces = dict()

        self.logger.info('Executing asset loader. Args: start_ts=%s, end_ts=%s, entities=%s, df_input=%s', start_ts, end_ts, entities, log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        if df is not None and df.empty:
            df = None

        # mapping loading is deferred to execute time but not init time
        self._load_asset_device_mappings()

        if len(self.asset_device_mappings) == 0:
            # empty mapping, we can't do anything without mapping
            if api.is_local_mode():
                raise ValueError('cannot find any asset_device_mappings, parameters asset_device_mappings must be given to create pipeline objects when in local mode')
            else:
                raise ValueError('cannot find any asset_device_mappings, add some from UI or by using function pmlib.set_asset_device_mappings()')

        # work on copies since resampling may substitute entries below
        data_items_copy = self.data_items.copy()
        names_copy = self.names.copy()

        df_index_id_name, df_index_timestamp_name = self._entity_type._df_index_entity_id, self._entity_type._timestamp
        df_entity_type_column = 'entity_type'

        # construct the asset-device mapping dataframe; rows are collected first and the
        # dataframe is built once, instead of concatenating a one-row frame per mapping
        entity_types_used = set()
        mapping_rows = []
        for asset_id, device_ids in self.asset_device_mappings.items():
            if len(device_ids) == 0:
                # for assets without sensors associated, still keep it so we can load asset data
                mapping_rows.append({self.df_asset_id_column: asset_id})
                continue

            for device_id in device_ids:
                # split on the first colon only so that a device id containing ':' stays intact
                entity_type, entity_id = device_id.split(':', 1)
                mapping_rows.append({self.df_asset_id_column: asset_id, df_entity_type_column: entity_type, df_index_id_name: entity_id})
                entity_types_used.add(entity_type)
        df_mappings = pd.DataFrame(mapping_rows, columns=[self.df_asset_id_column, df_entity_type_column, df_index_id_name])
        self.df_traces['mappings'] = df_mappings
        self.logger.debug('Converted asset device mappings to DataFrame: df_mappings=%s', log_df_info(df_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))

        # now loop through all the feature used types, including asset data

        # For time series data frames, IOT entity types loaded ones have id column 'deviceid' and the
        # corresponding entity type's timestamp column, with two new columns added: 'entity_type' and
        # 'asset_id'. Timestamp columns are renamed to be 'event_timestamp'. For asset time series data
        # frames, they should have id column named as 'asset_id' and timestamp column named
        # 'event_timestamp'. Methods calling Maximo API to generate asset data frames already conform to
        # this. For table loaded data frames, make sure they are renamed properly. Similarly for
        # non-time-series asset data, for the 'asset_id' id column. This is required in order to
        # merge/join all the data frames together at the end.
        # One caveat is that iotfunctions automatically adds 'id' and '_timestamp' columns if data is
        # loaded by it.

        self.logger.debug('Iterating over entity types to prep DataFrames for merging...')
        all_dfs = []
        asset_dfs = []
        asset_dimension = None # for keeping asset metadata dimensions to be joined after all others are merged
        for entity_type, data_items in self.entity_type_meta.items():
            is_asset_data = entity_type == self.ASSET_TYPE

            self.logger.debug('Current iteration: entity_type=%s, data_items=%s, is_asset_data=%s', entity_type, data_items, is_asset_data)

            if entity_type not in entity_types_used and not is_asset_data:
                # entity types without actual association with assets are no use since 
                # everything starts with assets, just ignore them
                continue

            entity_type_df = None

            if is_asset_data:
                if entity_type in self.data_substitution:
                    self.logger.debug('Using substitution for asset')

                    entity_type_df, asset_dimension = self.data_substitution[entity_type]

                    # filter by time range, if given
                    if start_ts is not None:
                        entity_type_df = entity_type_df[entity_type_df[self._entity_type._timestamp] >= pd.to_datetime(start_ts)]
                    if end_ts is not None:
                        entity_type_df = entity_type_df[entity_type_df[self._entity_type._timestamp] < pd.to_datetime(end_ts)]

                    # make siteid and assetnum all upper case since Maximo does that
                    if entity_type_df is not None and not entity_type_df.empty:
                        entity_type_df[self.df_asset_id_column] = entity_type_df[self.df_asset_id_column].str.upper()
                    if asset_dimension is not None and not asset_dimension.empty:
                        asset_dimension[self.df_asset_id_column] = asset_dimension[self.df_asset_id_column].str.upper()

                    self.df_traces['loaded_asset_ts'] = entity_type_df
                    self.logger.debug('df_loaded_asset_ts=%s', log_df_info(entity_type_df, head=5, logger=self.logger, log_level=logging.DEBUG))

                    self.df_traces['loaded_asset_dimension'] = asset_dimension
                    self.logger.debug('df_loaded_asset_dimension=%s', log_df_info(asset_dimension, head=5, logger=self.logger, log_level=logging.DEBUG))
                else:
                    # need to separate asset time-series data from non-time-series data, because non-time-series need to be 
                    # joined at the very end after all other asset/device time-series data are merged
                    self.logger.debug('Separating asset time series data from non time series data...')

                    self.logger.debug('data_items=%s', data_items)
                    ts_data_items = set(data_items) & {'faildate', 'failurecode', 'problemcode','causecode','remedycode'}
                    self.logger.debug('Updated timeseries data items: ts_data_items=%s', ts_data_items)

                    non_ts_data_items = set(data_items) - ts_data_items
                    self.logger.debug('Final time series data items: ts_data_items=%s', ts_data_items)
                    self.logger.debug('Final non time series data items: non_ts_data_items=%s', non_ts_data_items)

                    # when maximo is linked and the asset group is actually present in maximo, load asset data from
                    # maximo directly. otherwise, load asset data from the special internal asset entity type

                    # TODO can we avoid this asset group mapping API call?
                    if api.get_maximo_asset_device_mappings(self.asset_group) is not None:
                        # first prepare list of assets in Maximo non-composite form
                        maximo_assets = []
                        for asset_id in self.asset_device_mappings:
                            asset_num, site_id = asset_id.split('-____-')
                            maximo_assets.append({"assetNum": asset_num, "siteId": site_id})

                        if len(maximo_assets) > 0:
                            # time-series
                            if len(ts_data_items):
                                if self.use_cm == False:
                                    entity_type_df = api.get_asset_failure_history(assets=maximo_assets, data_items=list(ts_data_items), df_id_column=self.df_asset_id_column, df_timestamp_name=df_index_timestamp_name, start_ts=start_ts, end_ts=end_ts)
                                else:
                                    self.logger.debug('cm_code=%s', self.cm_code)
                                    entity_type_df = api.get_asset_corrective_maintenance_history(assets=maximo_assets, data_items=list(ts_data_items), df_id_column=self.df_asset_id_column, df_timestamp_name=df_index_timestamp_name, start_ts=start_ts, end_ts=end_ts,cm_code= self.cm_code)
                                self.df_traces['loaded_asset_ts'] = entity_type_df
                                if entity_type_df.shape[0] < 3 :
                                    self.logger.debug('WARNING: not enough failure history found. If you are building failure prediction model, a minimum 3 failure records is required. Otherwise if you are building unsupervised anomaly detection model you can ignore this warning')
                            else:
                                entity_type_df = None

                            # non-time-series dimensions
                            if len(non_ts_data_items) > 0:
                                asset_dimension = api.get_asset_attributes(assets=maximo_assets, data_items=list(non_ts_data_items), df_id_column=self.df_asset_id_column)
                                self.df_traces['loaded_asset_dimension'] = asset_dimension
                                self.logger.debug('df_loaded_asset_dimension=%s', log_df_info(asset_dimension, head=5, logger=self.logger, log_level=logging.DEBUG))
                    else:
                        # get the special internal asset entity type for loading asset data (from table)
                        asset_cache_entity_type = api._get_asset_cache_entity_type()

                        # time-series

                        if len(ts_data_items):
                            entity_type_df = asset_cache_entity_type.db.read_table(
                                table_name=asset_cache_entity_type.name,
                                schema=asset_cache_entity_type._db_schema,
                                timestamp_col=asset_cache_entity_type._timestamp,
                                parse_dates=None,
                                columns=[asset_cache_entity_type._entity_id, asset_cache_entity_type._timestamp] + list(ts_data_items),
                                start_ts=start_ts,
                                end_ts=end_ts,
                                entities=list(self.asset_device_mappings.keys()),
                                dimension=None
                            )

                            # rename id and timestamp columns to be the same ones so we can concat later
                            entity_type_df = entity_type_df.rename(columns={
                                asset_cache_entity_type._entity_id: self.df_asset_id_column,
                                asset_cache_entity_type._timestamp: df_index_timestamp_name,
                            })

                            self.df_traces['local_loaded_asset_ts'] = entity_type_df
                            self.logger.debug('df_local_loaded_asset_ts=%s', log_df_info(entity_type_df, head=5, logger=self.logger, log_level=logging.DEBUG))
                        else:
                            entity_type_df = None

                        # non-time-series dimensions

                        if len(non_ts_data_items) > 0:
                            asset_dimension = asset_cache_entity_type.db.read_table(
                                table_name=asset_cache_entity_type._dimension_table_name,
                                schema=asset_cache_entity_type._db_schema,
                                columns=[asset_cache_entity_type._entity_id] + list(non_ts_data_items),
                                entities=list(self.asset_device_mappings.keys())
                            )

                            # rename id column to be the same one so we can concat later
                            asset_dimension = asset_dimension.rename(columns={
                                asset_cache_entity_type._entity_id: self.df_asset_id_column,
                            })

                            self.df_traces['local_loaded_asset_dimension'] = asset_dimension
                            self.logger.debug('df_local_loaded_asset_dimension=%s', log_df_info(asset_dimension, head=5, logger=self.logger, log_level=logging.DEBUG))

                # TODO what to do when no asset data is loaded?

                if entity_type_df is None:
                    continue
            else: # not asset data (sensor data)
                if entity_type in self.data_substitution:
                    self.logger.debug('using substitution for entity_type=%s', entity_type)

                    entity_type_df = self.data_substitution[entity_type]

                    # filter by time range, if given
                    if start_ts is not None:
                        entity_type_df = entity_type_df[entity_type_df[self._entity_type._timestamp] >= pd.to_datetime(start_ts)]
                    if end_ts is not None:
                        entity_type_df = entity_type_df[entity_type_df[self._entity_type._timestamp] < pd.to_datetime(end_ts)]
                else:
                    entity_type_service = api.get_entity_type(entity_type, db=self._entity_type.db)

                    # always load as all-lower-case column names since sqlalchemy is tricky in some cases
                    # we'll rename columns back to correct cases at the end (after merged)
                    columns_to_load = list(data_items)

                    # figure out if resampling is configured, and if yes, change data_items/names accordingly
                    time_grain, agg_methods, agg_outputs = None, None, None
                    if self.resamples.get(entity_type, None) is not None:
                        time_grain, agg_methods, agg_outputs = self.resamples[entity_type]
                        data_items = [v for vv in agg_outputs.values() for v in vv]

                        # substitute the original feature names with the resampled features
                        original_feature_names = [d.split(':')[1] for d in data_items_copy if d.split(':')[0] == entity_type]
                        for feature in original_feature_names:
                            original_idx = data_items_copy.index('%s:%s' % (entity_type, feature))
                            data_items_copy[original_idx:original_idx+1] = ['%s:%s' % (entity_type, f) for f in agg_outputs[feature]]
                            names_copy[original_idx:original_idx+1] = agg_outputs[feature]

                        self.logger.debug('After resampled feature substitution, data_items_copy=%s, names_copy=%s', data_items_copy, names_copy)

                    self.logger.debug('Before call to get_entity_type_data: start_ts=%s, end_ts=%s, entity_type=%s, columns_to_load=%s, time_grain=%s, agg_methods=%s, agg_outputs=%s', start_ts, end_ts, entity_type, columns_to_load, time_grain, agg_methods, agg_outputs)
                    try:
                        entity_type_df = api.get_entity_type_data(entity_type,entity_type=entity_type_service, start_ts=start_ts, end_ts=end_ts, entities=entities, data_items=columns_to_load, time_grain=time_grain, agg_methods=agg_methods, agg_outputs=agg_outputs)
                        entity_type_df = entity_type_df.reset_index()
                    except TypeError as err:
                        if str(err) == "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'":
                            # TODO iotfunctions.util.resample has an defect when the read dataframe is empty
                            entity_type_df = pd.DataFrame(columns=[df_index_id_name, entity_type_service._timestamp] + data_items)
                        else:
                            raise RuntimeError('error getting data') from err
                    except Exception as err:
                        # Exception rather than BaseException, so that KeyboardInterrupt and
                        # SystemExit are not converted into RuntimeError
                        raise RuntimeError('error getting data') from err

                    self.df_traces['loaded_%s_before_dropna_rename' % entity_type] = entity_type_df

                    # drop those rows with NA values for all data items to be loaded
                    if entity_type_df.shape[0] > 0:
                        entity_type_df = entity_type_df.dropna(how='all', subset=data_items)

                    # rename timestamp column to be the one used by the asset group entity type so we can concat later
                    entity_type_df = entity_type_df.rename(columns={
                        entity_type_service._timestamp: df_index_timestamp_name
                    })

                self.df_traces['loaded_%s' % entity_type] = entity_type_df

                entity_type_df = entity_type_df.rename(columns={
                        'entity_id':'id', 'timestamp':'event_timestamp' })
                self.logger.debug('DF for entity_type=%s: %s', entity_type, log_df_info(entity_type_df, head=5, logger=self.logger, log_level=logging.DEBUG))

                # force set the entity type column
                entity_type_df[df_entity_type_column] = entity_type

                # now merge with mappings to get asset_id column
                entity_type_df = pd.merge(left=entity_type_df, right=df_mappings, how='inner', on=[df_entity_type_column, df_index_id_name], sort=False)

            # at this point, entity_type_df is expected to use df_index_id_name, df_index_timestamp_name, and has df_asset_id_column

            # TODO potential name conflict among sensor/asset types, need to validate and reject when it happens

            # rename columns
            renamed_columns = {}
            for idx, name in enumerate(data_items_copy):
                name_type, name = name.split(':')
                if name_type == entity_type:
                    renamed_columns[name] = names_copy[idx]
            # filter only actual renamed pairs
            renamed_columns = {k: v for k, v in renamed_columns.items() if k != v}
            if len(renamed_columns) > 0:
                self.logger.debug('renamed_columns=%s for entity_type=%s', renamed_columns, entity_type)
                entity_type_df = entity_type_df.rename(columns=renamed_columns)

            entity_type_df = entity_type_df.astype({df_index_timestamp_name: 'datetime64[ms]'})

            if is_asset_data:
                asset_dfs.append(entity_type_df) # append asset data 
            else:
                all_dfs.append(entity_type_df) # append sensor data

            self.df_traces['loaded_n_mapped_%s' % entity_type] = entity_type_df

        self.logger.debug('Finished iterating over entity types. Retrieved %s asset DataFrames and %s sensor DataFrames', len(asset_dfs), len(all_dfs))

        if len(all_dfs) > 0:
            self.logger.debug('Fixing index on sensor DF...')
            all_dfs = [d.set_index([self.df_asset_id_column, df_index_timestamp_name]) for d in all_dfs]
            all_dfs = [d.loc[~d.index.duplicated(keep='last')] for d in all_dfs]
            df1 = pd.concat(all_dfs, axis=1, sort=False) # This is sensor dataframe
            df1 = df1.reset_index().sort_values(df_index_timestamp_name)

        if len(asset_dfs) > 0:
            self.logger.debug('Fixing index on failure DF...')
            asset_dfs = [d.set_index([self.df_asset_id_column, df_index_timestamp_name]) for d in asset_dfs]
            asset_dfs = [d.loc[~d.index.duplicated(keep='last')] for d in asset_dfs]
            df2 = pd.concat(asset_dfs, axis=1, sort=False) # This is the failure dataframe
            df2 = df2.reset_index().sort_values(df_index_timestamp_name)

        # NOTE(review): in the sensor-only and asset-only cases below, the indexed per-type
        # frames are stacked row-wise by the later pd.concat rather than using df1/df2 --
        # confirm this is intended when more than one entity type is loaded
        tot_dfs = []
        if len(all_dfs) == 0 and len(asset_dfs) == 0: # no data found
            self.logger.warning('No data found')
        elif len(all_dfs) == 0: # no sensor, only asset data. e.g degradation curve
            #merge asset data
            tot_dfs = asset_dfs
        elif len(asset_dfs) == 0: # no asset data, only sensor data. e.g unsupervised AD
            #merge sensor data
            tot_dfs = all_dfs
        else: #found both sensor and asset data, need to merge them
            # We will loop thru asset_id's and merge sensor with failure rec for each asset
            tolerance_window = pd.Timedelta(timedelta(minutes=1440, seconds = 0, milliseconds=0, microseconds=0)) # 24 hours
            self.logger.debug('Iterating over assets to merge sensor and asset data...')
            for asset_id, df11 in df1.groupby(self.df_asset_id_column):
                self.logger.debug('Current iteration asset: %s', asset_id)
                df11 = df11.sort_values(df_index_timestamp_name)
                # non-inplace drop: the filtered frame is a slice of df2, and an in-place
                # drop on it triggers pandas' SettingWithCopyWarning
                df22 = df2[df2[self.df_asset_id_column] == asset_id].drop(columns=self.df_asset_id_column)
                df22 = df22.sort_values(df_index_timestamp_name)
                # each sensor row picks up the next asset record within the tolerance window
                dff = pd.merge_asof(df11, df22, on=df_index_timestamp_name, tolerance=tolerance_window, direction='forward')
                self.logger.debug('Merged DF for asset id=%s: dff=%s', asset_id, log_df_info(dff, head=0, logger=self.logger, log_level=logging.DEBUG))
                tot_dfs.append(dff)

        if len(tot_dfs) > 0:
            self.logger.debug('Merging %s different failure & sensor DataFrames', len(tot_dfs))
            df = pd.concat(tot_dfs)
            self.df_traces['merged'] = df
            df = df.reset_index()
            self.logger.debug('Merged DataFrames: df_merged=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # join with asset dimensions
        if asset_dimension is not None:
            self.logger.debug('Merging new DF with asset_dimension DF...')
            if df is not None:
                self.logger.debug('asset_dimension=%s', log_df_info(asset_dimension, head=5, logger=self.logger, log_level=logging.DEBUG))

                df = df.merge(asset_dimension, how='left', left_on=self.df_asset_id_column, right_on=self.df_asset_id_column, sort=False)
                self.df_traces['merged_with_asset_dimension'] = df
                self.logger.debug('Completed merge. New DF=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
            else:
                # only asset dimension data
                df = asset_dimension

        if df is None:
            df = pd.DataFrame(columns=[self.df_asset_id_column, df_index_timestamp_name])

        if df_index_timestamp_name not in df.columns:
            # special case that no time-series data is loaded, only dimensions
            # populate the time base column automatically, since 
            # this is just to conform to AS pipeline, this time base column is 
            # not going to be used
            df[df_index_timestamp_name] = pd.date_range(end=pd.Timestamp.now(tz='UTC'), periods=len(df), freq='min', tz='UTC')

        # sort by time before we do the merge
        df = df.set_index([self.df_asset_id_column, df_index_timestamp_name])
        to_drop = set(df.columns) - set(names_copy) - {self._entity_type._entity_id, self._entity_type._timestamp_col}
        df = df.drop(columns=list(to_drop)).sort_index()
        self.df_traces['merged_sorted_n_column_dropped'] = df
        self.logger.debug('Sorted DF by time and dropped columns. df=%s', log_df_info(df, head=0, logger=self.logger, log_level=logging.DEBUG))

        # reorder the columns according to the original given list
        if set(names_copy).issubset(set(df.columns)):
            df = df[names_copy]
            self.logger.debug('Reordered columns according to config: df=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # NOTE: the forward-fill / dropna post-processing is disabled here; the fillna,
        # fillna_exclude, dropna and dropna_exclude settings stored on the instance are
        # not applied by this method.

        # rename index to the standard one
        df = df.rename_axis([df_index_id_name if label == self.df_asset_id_column else label for label in df.index.names])

        self.df_traces['output'] = df.copy()

        self.logger.debug('Finished executing AssetLoader. Final DF: %s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        return df

    def _load_asset_device_mappings(self):
        self.logger.debug('Loading asset device mappings...')
        if self.asset_device_mappings is None or not isinstance(self.asset_device_mappings, dict) or len(self.asset_device_mappings) == 0:
            # note that left outer join from asset group table to asset device mapping tables is used to make sure 
            # in cases that either just asset data is used (hence no device) or no specific device mappings are 
            # created for an asset, those assets (in the group) are still being picked up

            # get asset group table
            asset_group_table = api._get_asset_group_table(db=self._entity_type.db, db_schema=self._entity_type._db_schema)
 
            # get AHI asset device mappings
            ahi_mappings_table = api._get_asset_device_attribute_mapping_table(db=self._entity_type.db, db_schema=self._entity_type._db_schema)
            query = sqlalchemy.select( asset_group_table.c['site'], asset_group_table.c['asset'], ahi_mappings_table.c['devicetype'], ahi_mappings_table.c['deviceid'] ).select_from( asset_group_table.join( ahi_mappings_table, (ahi_mappings_table.c['site'] == asset_group_table.c['site']) & (ahi_mappings_table.c['asset'] == asset_group_table.c['asset']), isouter=True ) ).where(asset_group_table.c['assetgroup'] == self.asset_group)
            self.logger.debug('Query AHI for asset device mappings: SQL Query=%s', query)

            db = api._get_db()
            with db.engine.connect() as conn:
                df_ahi_mappings = pd.read_sql(sql=query, con=conn)
            #print(df_ahi_mappings)
            self.logger.debug('Query results: df_ahi_mappings=%s', log_df_info(df_ahi_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))

            df_ahi_mappings_2=df_ahi_mappings.copy()
            #df['bar'] = df['bar'].str.cat(df['foo'].values.astype(str), sep=' is ')
            df_ahi_mappings['assetfullid'] = df_ahi_mappings['asset'].str.cat(df_ahi_mappings_2['site'].values.astype(str),sep='-____-')

            #df_ahi_mappings['assetfullid'] = (df_ahi_mappings['asset'] + '-____-' + df_ahi_mappings['site'] ).astype(str)
            #print(df_pmi_mappings)
            #df_ahi_mappings['assetfullid'] = df_ahi_mappings['asset'].astype(str) + '-____-' + df_ahi_mappings['site'].astype(str)
            self.logger.debug('Update DF with asset full ID: df_ahi_mappings=%s', log_df_info(df_ahi_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))

            # load PMI asset device mappings here to merge with AHI side
            pmi_mappings_table = api._get_asset_device_mapping_table(db=self._entity_type.db, db_schema=self._entity_type._db_schema)
            query = sqlalchemy.select(
                asset_group_table.c['site'], 
                asset_group_table.c['asset'], 
                pmi_mappings_table.c['devicetype'], 
                pmi_mappings_table.c['deviceid']
            ).select_from(
                asset_group_table.join(
                    pmi_mappings_table, 
                    (pmi_mappings_table.c['site'] == asset_group_table.c['site']) & 
                    (pmi_mappings_table.c['asset'] == asset_group_table.c['asset']),
                    isouter=True
                )
            ).where(
                asset_group_table.c['assetgroup'] == self.asset_group
            )
            self.logger.debug('Query PMI for asset device mappings: SQL Query=%s', query)
            with db.engine.connect() as conn:
                df_pmi_mappings = pd.read_sql(sql=query, con=conn)
            #print(df_pmi_mappings)

            self.logger.debug('Query results:  df_pmi_mappings=%s', log_df_info(df_pmi_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))

            df_pmi_mappings_2=df_pmi_mappings.copy()

            df_pmi_mappings['assetfullid'] = df_pmi_mappings['asset'].str.cat(df_pmi_mappings_2['site'].values.astype(str),sep='-____-')

            #df_pmi_mappings['assetfullid'] = (df_pmi_mappings['asset'] + '-____-' + df_pmi_mappings['site'] ).astype('string')
            #df_pmi_mappings['assetfullid'] = df_pmi_mappings['asset'].astype(str) + '-____-' + df_pmi_mappings['site'].astype(str)
            self.logger.debug('Updated DF with asset full ID: df_pmi_mappings=%s', log_df_info(df_pmi_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))

            # merge them, with PMI as the base
            merged_mappings = defaultdict(set)
            for row in df_pmi_mappings.itertuples():
                assetfullid_string = row.asset+ '-____-'+ row.site
                #print('pmi assetfullid_string',assetfullid_string)
                #print(type(assetfullid_string))
                if pd.notna(row.devicetype) and pd.notna(row.deviceid):
                    
                    merged_mappings[assetfullid_string].add('%s:%s' % (row.devicetype, row.deviceid))

                    #merged_mappings[row.assetfullid].add('%s:%s' % (row.devicetype, row.deviceid))
                else:
                    # make sure the default set is created, but not adding anything
                    merged_mappings[assetfullid_string]
                    #merged_mappings[row.assetfullid]

            # with AHI extra devices added
            for row in df_ahi_mappings.itertuples():
                assetfullid_string = row.asset+ '-____-'+ row.site
                
                #print('ahi assetfullid_string',assetfullid_string)
                #print(type(assetfullid_string))
                if pd.notna(row.devicetype) and pd.notna(row.deviceid):
                    merged_mappings[assetfullid_string].add('%s:%s' % (row.devicetype, row.deviceid))
                    #merged_mappings[row.assetfullid].add('%s:%s' % (row.devicetype, row.deviceid))
                else:
                    # make sure the default set is created, but not adding anything
                    merged_mappings[assetfullid_string]
                    #merged_mappings[row.assetfullid]

            merged_mappings = {key: list(mappings) for key, mappings in merged_mappings.items()}

            self._set_asset_device_mappings(merged_mappings)

    def _set_asset_device_mappings(self, asset_device_mappings):
        """Validate the given asset-device mappings and store the outcome on this loader.

        The value is passed through `_validate_mappings()`, whose two results are kept as 
        `self.asset_device_mappings` and `self.entity_type_meta` respectively.

        Parameters
        ----------
        asset_device_mappings : `dict` or `None`
            The raw mappings, handed to `_validate_mappings()` unchanged.
        """
        validated_mappings, type_to_items = self._validate_mappings(asset_device_mappings)

        self.asset_device_mappings = validated_mappings
        self.entity_type_meta = type_to_items

        # log the raw input next to what was actually stored
        self.logger.debug(
            'Set asset device mappings: input_asset_device_mappings=%s, asset_device_mappings=%s, entity_type_meta=%s',
            asset_device_mappings,
            validated_mappings,
            type_to_items,
        )

    def set_data_substitution(self, entity_type, data, errors='ignore'):
        """Set entity type data substitution, including asset data.

        All given substitutions are outer-merged into at most two data frames, one for time-series 
        data (items with a `timestamp`) and one for dimension data (items without). The result is 
        stored in `self.data_substitution[entity_type]`: a two-element list `[time_series, dimension]` 
        for asset data, or the time-series data frame alone for any other entity type.

        Parameters
        ----------
        entity_type : `str`
            The entity type name of which the data is to be substituted. Must be one of the entity 
            types in `self.entity_type_meta`; `self.ASSET_TYPE` represents asset data.
        data : `list` of `dict`
            One dict per substitution, with the following entries:

            * df : `pandas.DataFrame`, required. The data itself; it is not modified.
            * keys : `str`, or non-empty `list`/`tuple` of `str`, required. Key column name(s), 
              converted to type `str`. The first key is renamed to the id column.
            * timestamp : `str`, optional. Timestamp column name. When given, the item is treated 
              as time-series, otherwise as dimension data.
            * columns : `list`/`tuple`, optional. Columns to keep besides keys and timestamp. 
              Defaults to all remaining columns, in the order they appear in `df`.
            * rename_columns : `dict`, optional. Column renaming, applied after column selection. 
              An entry for the timestamp column is ignored. The given dict is not modified.
            * parse_dates : `list` of `str`, optional. Columns to be parsed as UTC datetimes.
        errors : {'ignore', 'raise'}, optional
            Control raising ValueError on invalid `entity_type`.

            * raise : allow exceptions to be raised.
            * ignore : suppress exceptions. On error simply return.

        Raises
        ------
        ValueError
            If `entity_type` or `data` is None, if any substitution entry is of the wrong type, if 
            `entity_type` is invalid and `errors` is 'raise', or if the substitution for a non-asset 
            entity type has no time-series data or lacks any column used in the pipeline.
        """

        if entity_type is None:
            raise ValueError('argument entity_type must not be None')

        if data is None:
            raise ValueError('argument data must not be None')

        # we need to run this here because execute() most likely has not been run yet 
        # and we need self.entity_type_meta for validation purpose
        self._load_asset_device_mappings()

        if entity_type not in self.entity_type_meta:
            if errors == 'raise':
                raise ValueError('invalid entity_type=%s' % entity_type)
            else:
                self.logger.warning('invalid entity_type=%s', entity_type)
                return

        # first merge all the given data substitution into two data frames, one for time-series and one for dimension

        df_index_id_name, df_index_timestamp_name = self._entity_type._df_index_entity_id, self._entity_type._timestamp

        df_substitution_ts = None
        df_substitution_dim = None
        for substitution in data:
            df_data = substitution['df']
            keys = substitution['keys']
            timestamp = substitution.get('timestamp')
            columns = substitution.get('columns')
            rename_columns = substitution.get('rename_columns')
            parse_dates = substitution.get('parse_dates')

            if isinstance(keys, str): 
                keys = [keys]
            if not isinstance(keys, (tuple, list)) or len(keys) == 0:
                raise ValueError('error keys given for data_substitution["%s"]: %s' % (entity_type, str(substitution['keys'])))
            # work on a private list so tuples are supported and the given sequence is never touched
            keys = list(keys)

            if timestamp is not None and not isinstance(timestamp, str):
                raise ValueError('error timestamp given for data_substitution["%s"]: %s' % (entity_type, str(substitution.get('timestamp'))))
            if columns is not None and not isinstance(columns, tuple) and not isinstance(columns, list):
                raise ValueError('error columns given for data_substitution["%s"]: %s' % (entity_type, str(substitution.get('columns'))))
            if rename_columns is not None and not isinstance(rename_columns, dict):
                raise ValueError('error rename_columns given for data_substitution["%s"]: %s' % (entity_type, str(substitution.get('rename_columns'))))
            if parse_dates is not None and (not isinstance(parse_dates, list) or not all([isinstance(attr, str) for attr in parse_dates])):
                raise ValueError('error parse_dates=%s, must be a list of string' % str(parse_dates))

            # make sure and convert keys to be of type str
            df_data = df_data.astype({key: str for key in keys})

            if columns is None:
                # default to every non-key, non-timestamp column, keeping the data frame's own 
                # column order so the result is deterministic
                excluded = set(keys)
                if timestamp is not None:
                    excluded.add(timestamp)
                columns = [col for col in dict.fromkeys(df_data.columns) if col not in excluded]

            remaining_columns = list(keys)
            if timestamp is not None:
                remaining_columns.append(timestamp)
            remaining_columns.extend(columns)
            df_data = df_data[remaining_columns]

            if rename_columns is not None:
                # asset group entity type always uses the default timestamp column name, so any 
                # renaming of the timestamp column is dropped; a filtered copy is used so that the 
                # caller's dict is left untouched
                rename_columns = {old: new for old, new in rename_columns.items() if old != timestamp}
                if parse_dates is not None:
                    parse_dates = [rename_columns[col] if col in rename_columns else col for col in parse_dates]
                df_data = df_data.rename(columns=rename_columns)

            # normalize the first key to the id column name expected downstream
            original_key_0 = keys[0]
            keys[0] = df_index_id_name if entity_type != self.ASSET_TYPE else self.df_asset_id_column
            df_data = df_data.rename(columns={original_key_0: keys[0]})

            if parse_dates is not None:
                for col in parse_dates:
                    if col in df_data.columns:
                        df_data[col] = pd.to_datetime(df_data[col], utc=True)

            if timestamp is not None:
                # time-series

                # first rename/normalize timestamp column name, values are converted to UTC 
                # and then made timezone-naive
                df_data = df_data.rename(columns={timestamp: df_index_timestamp_name})
                df_data[df_index_timestamp_name] = pd.to_datetime(df_data[df_index_timestamp_name], utc=True).dt.tz_localize(None)
                timestamp = df_index_timestamp_name

                all_keys = list(keys)
                all_keys.append(timestamp)
                if df_substitution_ts is None:
                    df_substitution_ts = df_data
                else:
                    df_substitution_ts = df_substitution_ts.merge(df_data, left_on=all_keys, right_on=all_keys, how='outer')
            else:
                # dimension
                if df_substitution_dim is None:
                    df_substitution_dim = df_data
                else:
                    df_substitution_dim = df_substitution_dim.merge(df_data, left_on=keys, right_on=keys, how='outer')

        if df_substitution_dim is not None:
            # the first column is the (first) key column, dimensions are always keyed by asset id
            df_substitution_dim = df_substitution_dim.rename(columns={df_substitution_dim.columns[0]: self.df_asset_id_column})

        # check if the given data substitution includes all data items used for this sensor entity type (not applicable to assets)
        if entity_type != self.ASSET_TYPE:
            if df_substitution_ts is None:
                # non-asset entity types only use time-series data, fail clearly instead of 
                # crashing on a None data frame
                raise ValueError('data substitution for entity_type=%s must contain time-series data (at least one item with "timestamp" given)' % entity_type)
            if not set(df_substitution_ts.columns).issuperset(set(self.entity_type_meta[entity_type])):
                raise ValueError('data substitution (%s) for entity_type=%s must contain all columns (%s) used in the pipeline' % (list(df_substitution_ts.columns), entity_type, self.entity_type_meta[entity_type]))

        self.logger.debug('df_substitution_ts=%s', log_df_info(df_substitution_ts))
        self.logger.debug('df_substitution_dim=%s', log_df_info(df_substitution_dim))
        if entity_type == self.ASSET_TYPE:
            self.data_substitution[entity_type] = [df_substitution_ts, df_substitution_dim]
        else:
            # non-asset entity types do not have dimensions
            self.data_substitution[entity_type] = df_substitution_ts

    def clear_data_substitution(self):
        self.data_substitution = {}

    def _validate_mappings(self, asset_device_mappings):
        '''Cross-check data_items (features) against asset_device_mappings.

        Each device id is expected in the form 'entity_type:device_id'. Unless running in local mode, 
        devices whose entity type name is not found in `self.entity_type_metadata` are logged and 
        dropped. Entity types present in the mappings but not used by any data item are logged and 
        ignored.

        Returns: A tuple of two dicts, the filtered asset-device mappings (asset ids converted to 
        `str`) and a dict of entity type to the list of its data items required to be loaded. Two 
        empty dicts are returned when the given mappings is None.

        Raises: RuntimeError when some data item refers to an entity type (other than the asset 
        type) that has no device left in the mappings.
        '''

        self.logger.debug('Validating asset device mappings...')

        if asset_device_mappings is None:
            return dict(), dict()

        # asset ids are handled as strings from here on
        asset_device_mappings = {str(asset_id): device_ids for asset_id, device_ids in asset_device_mappings.items()}

        # filter out devices of unknown entity types, nothing is filtered in local mode
        need_validation = not api.is_local_mode()
        if need_validation:
            all_entity_types = {meta.get('name', '') for meta in self.entity_type_metadata.values() if isinstance(meta, dict)}
        else:
            all_entity_types = set()
        for asset_id in list(asset_device_mappings):
            kept_device_ids = []
            for device_id in asset_device_mappings[asset_id]:
                device_type, _ = device_id.split(':')
                if need_validation and device_type not in all_entity_types:
                    self.logger.warning('ignore invalid device=%s in asset-device mappings', device_id)
                    continue
                kept_device_ids.append(device_id)
            asset_device_mappings[asset_id] = kept_device_ids
        self.logger.debug('Generated asset_device_mappings=%s', asset_device_mappings)

        # entity type -> set of data item names requested for it
        features_meta = dict()
        for item in self.data_items:
            name_type, name = item.split(':')
            if name_type not in features_meta:
                features_meta[name_type] = set()
            features_meta[name_type].add(name)
        self.logger.debug('Generated device type to features: features_meta=%s', features_meta)

        # entity types that actually have devices mapped
        mappings_meta = set()
        for device_ids in asset_device_mappings.values():
            for device_id in device_ids:
                entity_type, _ = device_id.split(':')
                mappings_meta.add(entity_type)
        self.logger.debug('Generated mappings_meta=%s', mappings_meta)

        feature_entity_types = set(features_meta)

        # features of an entity type without any mapped device cannot be loaded at all
        missing_feature_entity_types = feature_entity_types - mappings_meta - {self.ASSET_TYPE}
        if missing_feature_entity_types:
            msg = 'cannot find any device of feature_entity_type=%s associated with assets of asset_group_id=%s, therefore there is no data for those features, please check if those asset-device mappings are set correctly, either from UI or by calling function pmlib.set_asset_device_mappings()' % (str(list(missing_feature_entity_types)), self.asset_group)
            self.logger.error(msg)
            raise RuntimeError(msg)

        for entity_type in mappings_meta - feature_entity_types:
            self.logger.warning('ignore unused entity_type=%s in asset_device_mappings, not used in data_items', entity_type)

        entity_type_meta = {
            entity_type: list(features_meta[entity_type])
            for entity_type in feature_entity_types & mappings_meta
        }
        # asset data needs no device mapping, so it is added regardless
        if self.ASSET_TYPE in features_meta:
            entity_type_meta[self.ASSET_TYPE] = list(features_meta[self.ASSET_TYPE])

        return asset_device_mappings, entity_type_meta
    
    def _validate_data_items(self):
        """Check every entry of `self.data_items` against `self.entity_type_metadata`.

        Nothing is checked in local mode. Otherwise each item, expected in the form 
        'entity_type:name', must refer to an entity type resolvable via 
        `api.get_entity_from_metadata()` and to one of its data items. Items of the asset type 
        are skipped.

        Raises
        ------
        ValueError
            If an item refers to an unknown entity type or an unknown data item.
        """
        if api.is_local_mode():
            self.logger.debug('local_mode=True, all_entity_types=set()')
            return

        # collected for logging purpose only
        all_entity_types = {meta.get('name', '') for meta in self.entity_type_metadata.values() if isinstance(meta, dict)}
        self.logger.debug('Retrieved entity types from Monitor. all_entity_types=%s', all_entity_types)

        for item in self.data_items:
            name_type, name = item.split(':')

            # asset data items have no entity type metadata to validate against
            if name_type == self.ASSET_TYPE:
                continue

            entity_type_instance = api.get_entity_from_metadata(self.entity_type_metadata, name_type)
            if entity_type_instance is None:
                raise ValueError('"%s" has invalid entity type (case-sensitive)' % item)

            # NOTE: data item names are compared in lower case on both sides, even though the 
            # error message below says case-sensitive
            data_item_names = {dto['name'].lower() for dto in entity_type_instance['dataItemDto']}
            if name.lower() in data_item_names:
                continue

            self.logger.warning('name=%s not found in entity_type=%s data_items=%s', name, name_type, data_item_names)
            raise ValueError('"%s" is an invalid data item (case-sensitive)' % item)



class AggregatedAssetLoader(_BaseTransformer):

    is_data_source = True
    """Asset data loader is a data source in AS.

    Data source functions are executed at the beginning stage of AS pipelines.
    """

    merge_method = 'replace'
    """Asset data loader is primary data source which always replaces upstream dataframe with its own.

    Using replace merge method means our asset data loader is the one responsible for what should be the 
    dataframe passed to downstream. Internally, our asset data loader does all the necessary merging and 
    joining, between the input upstream dataframe and loaded asset data. This merge method makes sure 
    AS does not intervene with this logic.
    """

    ASSET_TYPE = ''

    def __init__(self, asset_group, _entity_type, data_items, names, resamples, entity_type_metadata, asset_device_mappings,
                 fillna='ffill', fillna_exclude=None, dropna='any', dropna_exclude=None, use_cm=False, cm_code='CM'):
        """Create the loader and validate the given data items and asset-device mappings.

        All arguments are stored on the instance under the same names. `data_items` and `names` go 
        through `_init_data_items()`, `asset_device_mappings` goes through 
        `_set_asset_device_mappings()`, and a `resamples` of None is replaced with an empty dict.
        """
        super().__init__()

        self.asset_group = asset_group
        self._entity_type = _entity_type

        # metadata must be in place before the data items are validated
        self.entity_type_metadata = entity_type_metadata
        self._init_data_items(data_items, names)

        self.resamples = resamples if resamples is not None else dict()

        # give highest priority to mappings from parameter which is assumed to be used 
        # only for testing purpose (offline usage), this essentially stops the mapping 
        # loading from local cache (of remote system)
        self._set_asset_device_mappings(asset_device_mappings)

        # missing value handling options, stored as given
        self.fillna, self.fillna_exclude = fillna, fillna_exclude
        self.dropna, self.dropna_exclude = dropna, dropna_exclude

        self.data_substitution = dict()

        self.df_asset_id_column = 'asset_id'

        self.use_cm, self.cm_code = use_cm, cm_code

    def _init_data_items(self, data_items, names, skip_validation=False):
        if not isinstance(data_items, list) or not isinstance(names, list) or len(data_items) != len(names):
            raise ValueError('arguments data_items and names must be both list and of same length')

        self.data_items = data_items
        self.names = names

        # validate the given data_items to see if any invalid entity type
        if skip_validation is not True:
            self._validate_data_items()

    def execute(self, df=None, start_ts=None, end_ts=None, entities=None):
        self.df_traces = dict()

        self.logger.debug('begin of AggregatedAssetLoader start_ts=%s, end_ts=%s, entities=%s, df_input=%s', start_ts, end_ts, entities, log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        if df is not None and df.empty:
            df = None

        # mapping loading is deferred to execute time but not init time
        self._load_asset_device_mappings()

        if len(self.asset_device_mappings) == 0:
            # empty mapping, we can't do anything without mapping
            if api.is_local_mode():
                raise ValueError('cannot find any asset_device_mappings, parameters asset_device_mappings must be given to create pipeline objects when in local mode')
            else:
                raise ValueError('cannot find any asset_device_mappings, add some from UI or by using function pmlib.set_asset_device_mappings()')

        data_items_copy = self.data_items.copy()
        names_copy = self.names.copy()

        df_index_id_name, df_index_timestamp_name = self._entity_type._df_index_entity_id, self._entity_type._timestamp
        df_entity_type_column = 'entity_type'

        # construct the asset-device mapping dataframe
        entity_types_used = set()
        df_mappings = pd.DataFrame(columns=[self.df_asset_id_column, df_entity_type_column, df_index_id_name])
        for asset_id, device_ids in self.asset_device_mappings.items():
            if len(device_ids) == 0:
                # for assets without sensors associated, still keep it so we can load asset data
                df_mappings = pd.concat([df_mappings, pd.DataFrame([{self.df_asset_id_column: asset_id}])], ignore_index=True)
                continue

            for device_id in device_ids:
                entity_type, entity_id = device_id.split(':')
                df_mappings = pd.concat([df_mappings, pd.DataFrame([{self.df_asset_id_column: asset_id, df_entity_type_column: entity_type, df_index_id_name: entity_id}])], ignore_index=True)
                entity_types_used.add(entity_type)
        self.df_traces['mappings'] = df_mappings
        self.logger.debug('df_mappings=%s', log_df_info(df_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))

        # now loop through all the feature used types, including asset data

        '''
        For time series data frame, IOT entity types loaded ones have id column 'deviceid' and corresponding entity type's 
        timestamp column, with two new columns added: 'entityt_type' and 'asset_id'. Timestamp columns are renamed to be 
        'event_timestamp'. For asset time series data frames, they should have id column named as 'asset_id' and timestamp 
        column named 'event_timestamp'. Methods calling Maximo API to generate asset data frames already conform to this. 
        For table loaded data frames, make sure they are renamed properly. Similarly for non-time-series asset data, for 
        the 'asset_id' id column. This is required in order to merge/join all the data frame together at the end.

        One caveat is that iotfunctions automatically adds 'id' and '_timestamp' columns if data is loaded by it.
        '''

        all_dfs = []
        asset_dfs =[]
        asset_dimension = None # for keeping asset metadata dimensions to be joined after all others are merged
        for entity_type, data_items in self.entity_type_meta.items():
            is_asset_data = entity_type == self.ASSET_TYPE

            self.logger.debug('entity_type=%s, data_items=%s, is_asset_data=%s', entity_type, data_items, is_asset_data)

            if entity_type not in entity_types_used and not is_asset_data:
                # entity types without actual association with assets are no use since 
                # everything starts with assets, just ignore them
                continue

            entity_type_df = None

            if is_asset_data:
                if entity_type in self.data_substitution:
                    self.logger.debug('using substitution for asset')

                    entity_type_df, asset_dimension = self.data_substitution[entity_type]

                    # filter by time range, if given
                    if start_ts is not None:
                        entity_type_df = entity_type_df[entity_type_df[self._entity_type._timestamp] >= pd.to_datetime(start_ts)]
                    if end_ts is not None:
                        entity_type_df = entity_type_df[entity_type_df[self._entity_type._timestamp] < pd.to_datetime(end_ts)]

                    # make siteid and assetnum all upper case since Maximo does that
                    if entity_type_df is not None and not entity_type_df.empty:
                        entity_type_df[self.df_asset_id_column] = entity_type_df[self.df_asset_id_column].str.upper()
                    if asset_dimension is not None and not asset_dimension.empty:
                        asset_dimension[self.df_asset_id_column] = asset_dimension[self.df_asset_id_column].str.upper()

                    self.df_traces['loaded_asset_ts'] = entity_type_df
                    self.logger.debug('df_loaded_asset_ts=%s', log_df_info(entity_type_df, head=5, logger=self.logger, log_level=logging.DEBUG))

                    self.df_traces['loaded_asset_dimension'] = asset_dimension
                    self.logger.debug('df_loaded_asset_dimension=%s', log_df_info(asset_dimension, head=5, logger=self.logger, log_level=logging.DEBUG))
                else:
                    # need to separate asset time-series data from non-time-series data, because non-time-series need to be 
                    # joined at the very end after all other asset/device time-series data are merged
                    self.logger.debug('in AssetLoader data_items=%s', data_items)
                    ts_data_items = set(data_items) & {'faildate', 'failurecode', 'problemcode','causecode','remedycode'}
                    self.logger.debug('in AssetLoader ts_data_items=%s', ts_data_items)

                    non_ts_data_items = set(data_items) - ts_data_items
                    self.logger.debug('ts_data_items=%s, non_ts_data_items=%s', ts_data_items, non_ts_data_items)

                    # when maximo is linked and the asset group is actually present in maximo, load asset data from
                    # maximo directly. otherwise, load asset data from the special internal asset entity type

                    # TODO can we avoid this asset group mapping API call?
                    if api.get_maximo_asset_device_mappings(self.asset_group) is not None:
                        # first prepare list of assets in Maximo non-composite form
                        maximo_assets = []
                        for asset_id, mapping in self.asset_device_mappings.items():
                            asset_num, site_id = asset_id.split('-____-')
                            maximo_assets.append({"assetNum":asset_num, "siteId": site_id})
                        
                        if len(maximo_assets) > 0:
                            # time-series
                            if len(ts_data_items):
                                entity_type_df = api.get_asset_failure_history(assets=maximo_assets, data_items=list(ts_data_items), df_id_column=self.df_asset_id_column, df_timestamp_name=df_index_timestamp_name, start_ts=start_ts, end_ts=end_ts)
                                self.df_traces['loaded_asset_ts'] = entity_type_df
                                self.logger.debug('df_loaded_asset_ts=%s', log_df_info(entity_type_df, head=5, logger=self.logger, log_level=logging.DEBUG))
                            else:
                                entity_type_df = None

                            # non-time-series dimensions
                            if len(non_ts_data_items) > 0:
                                asset_dimension = api.get_asset_attributes(assets=maximo_assets, data_items=list(non_ts_data_items), df_id_column=self.df_asset_id_column)
                                self.df_traces['loaded_asset_dimension'] = asset_dimension
                                self.logger.debug('df_loaded_asset_dimension=%s', log_df_info(asset_dimension, head=5, logger=self.logger, log_level=logging.DEBUG))
                    else:
                        # get the special internal asset entity type for loading asset data (from table)
                        asset_cache_entity_type = api._get_asset_cache_entity_type()

                        # time-series

                        if len(ts_data_items):
                            entity_type_df = asset_cache_entity_type.db.read_table(
                                table_name=asset_cache_entity_type.name,
                                schema=asset_cache_entity_type._db_schema,
                                timestamp_col=asset_cache_entity_type._timestamp,
                                parse_dates=None,
                                columns=[asset_cache_entity_type._entity_id, asset_cache_entity_type._timestamp] + list(ts_data_items),
                                start_ts=start_ts,
                                end_ts=end_ts,
                                entities=list(self.asset_device_mappings.keys()),
                                dimension=None
                            )

                            # rename id and timestamp columns to be the same ones so we can concat later
                            entity_type_df = entity_type_df.rename(columns={
                                asset_cache_entity_type._entity_id: self.df_asset_id_column,
                                asset_cache_entity_type._timestamp: df_index_timestamp_name,
                            })

                            self.df_traces['local_loaded_asset_ts'] = entity_type_df
                            self.logger.debug('df_local_loaded_asset_ts=%s', log_df_info(entity_type_df, head=5, logger=self.logger, log_level=logging.DEBUG))
                        else:
                            entity_type_df = None

                        # non-time-series dimensions

                        if len(non_ts_data_items) > 0:
                            asset_dimension = asset_cache_entity_type.db.read_table(
                                table_name=asset_cache_entity_type._dimension_table_name,
                                schema=asset_cache_entity_type._db_schema,
                                columns=[asset_cache_entity_type._entity_id] + list(non_ts_data_items),
                                entities=list(self.asset_device_mappings.keys())
                            )

                            # rename id column to be the same one so we can concat later
                            asset_dimension = asset_dimension.rename(columns={
                                asset_cache_entity_type._entity_id: self.df_asset_id_column,
                            })

                            self.df_traces['local_loaded_asset_dimension'] = asset_dimension
                            self.logger.debug('df_local_loaded_asset_dimension=%s', log_df_info(asset_dimension, head=5, logger=self.logger, log_level=logging.DEBUG))

                # TODO what to do when no asset data is loaded?

                if entity_type_df is None:
                    continue
            else:
                if entity_type in self.data_substitution:
                    self.logger.debug('using substitution for entity_type=%s', entity_type)

                    entity_type_df = self.data_substitution[entity_type]

                    # filter by time range, if given
                    if start_ts is not None:
                        entity_type_df = entity_type_df[entity_type_df[self._entity_type._timestamp] >= pd.to_datetime(start_ts)]
                    if end_ts is not None:
                        entity_type_df = entity_type_df[entity_type_df[self._entity_type._timestamp] < pd.to_datetime(end_ts)]
                else:
                    entity_type_service = api.get_entity_type(entity_type, db=self._entity_type.db)

                    # always load as all-lower-case column names since sqlalchemy is tricky in some cases
                    # we'll rename columns back to correct cases at the end (after merged)
                    columns_to_load = [col for col in data_items]

                    # figure out if resampling is configured, and if yes, change data_items/names accordingly
                    time_grain, agg_methods, agg_outputs = None, None, None
                    if self.resamples.get(entity_type, None) is not None:
                        time_grain, agg_methods, agg_outputs = self.resamples[entity_type]
                        data_items = [v for k, vv in agg_outputs.items() for v in vv]

                        # substitute the original feature names with the resampled features
                        original_featur_names = [d.split(':')[1] for d in data_items_copy if d.split(':')[0] == entity_type]
                        for idx, feature in enumerate(original_featur_names):
                            original_idx = data_items_copy.index('%s:%s' % (entity_type, feature))
                            data_items_copy[original_idx:original_idx+1] = ['%s:%s' % (entity_type, f) for f in agg_outputs[feature]]
                            names_copy[original_idx:original_idx+1] = agg_outputs[feature]

                        self.logger.debug('after resampled feature substitution, data_items_copy=%s, names_copy=%s', data_items_copy, names_copy)

                    self.logger.debug('AggregatedAssetLoader before get_data: start_ts=%s, end_ts=%s, entity_type=%s, columns_to_load=%s, time_grain=%s, agg_methods=%s, agg_outputs=%s', start_ts, end_ts, entity_type, columns_to_load, time_grain, agg_methods, agg_outputs)
                    try:
                        #entity_type_df = api.get_entity_type_data(entity_type=entity_type_service, start_ts=start_ts, end_ts=end_ts, entities=entities, data_items=columns_to_load, time_grain=time_grain, agg_methods=agg_methods, agg_outputs=agg_outputs)

                        db = api._get_db(asset_group_id=self.asset_group)
                        
                        entity_type_id=api.get_entity_type_id(entity_type)
                        self.logger.debug('entity_type_id =%s', entity_type_id)
                        #table_name= "DM_DEVICE_TYPE_" + str(entity_type_id) + "_1"
                        #self.logger.debug('Aggregated table name=%s', table_name)

                        # In Custom env, if you create minute level aggregate data, then you need the below code
                        table_name=api.get_table_name_for_aggregate_data(entity_type, columns_to_load)
                        self.logger.debug('api.get_table_name_for_aggregate_data Aggregated table name=%s', table_name)

                        table = db.get_table(table_name, schema=os.environ['AS_SCHEMA']) 
                        
                        
                        
                        #from datetime import timedelta
                        #start_ts = dt.datetime.utcnow() - dt.timedelta(days=500)
                        #end_ts = dt.datetime.utcnow()
                        df_t = db.read_table(table, None, columns=['entity_id', 'KEY', 'value_n', 'TIMESTAMP'], timestamp_col="timestamp", start_ts=start_ts, end_ts=end_ts)
                        #df_t.head()
                        self.logger.debug('in the new code  of AggregatedAssetLoader start_ts=%s, end_ts=%s, entities=%s, df_t=%s', start_ts, end_ts, entities, log_df_info(df_t, head=5, logger=self.logger, log_level=logging.DEBUG))
                        data_frames=[]
                        key_list=df_t['KEY'].unique()
                        #print(key_list)
                        data_df=pd.DataFrame({'entity_id':[],'TIMESTAMP':[]})

                        
                        for key in key_list:
                            each_df= df_t[df_t['KEY'] == key]
                            each_df.rename(columns={'value_n':key},inplace=True)
                            each_df.drop(['KEY'],axis=1,inplace=True)
                            data_df= pd.merge(data_df, each_df, on=['entity_id','TIMESTAMP'], how='outer')
                        self.logger.info('in the new code  of AggregatedAssetLoader after load start_ts=%s, end_ts=%s, entities=%s, data_df=%s', start_ts, end_ts, entities, log_df_info(data_df, head=5, logger=self.logger, log_level=logging.DEBUG))
                        
                        entity_type_df = data_df

                        #entity_type_df = entity_type_df.reset_index()
                    except TypeError as err:
                        if str(err) == "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'":
                            # TODO iotfunctions.util.resample has an defect when the read dataframe is empty
                            entity_type_df = pd.DataFrame(columns=[df_index_id_name, entity_type_service._timestamp] + data_items)
                        else:
                            raise RuntimeError('error getting data') from err
                    except BaseException as err:
                        raise RuntimeError('error getting data') from err

                    self.df_traces['loaded_%s_before_dropna_rename' % entity_type] = entity_type_df
                        
                    self.logger.debug('in the new code  of AggregatedAssetLoader  after get_data: df_loaded_%s=%s', entity_type, log_df_info(entity_type_df, head=5, logger=self.logger, log_level=logging.DEBUG))

                    # drop those rows with NA values for all data items to be loaded
                    
                    #print('Kewei data_items=',str(data_items))
                    if entity_type_df.shape[0] != 0 :
                        entity_type_df = entity_type_df.dropna(how='all', subset=data_items)

                    # rename timestamp column to be the one used by the asset group entity type so we can concat later
                    entity_type_df = entity_type_df.rename(columns={
                        'TIMESTAMP': df_index_timestamp_name
                    })

                #self.df_traces['loaded_%s' % entity_type] = entity_type_df
                

                # force set the entity type column
                entity_type_df[df_entity_type_column] = entity_type
                
                # kewei rename entity_id to df_index_id_name=id
                entity_type_df = entity_type_df.rename(columns={
                        'entity_id': df_index_id_name
                    })
                self.logger.debug('after rename df_loaded_%s=%s', entity_type, log_df_info(entity_type_df, head=5, logger=self.logger, log_level=logging.DEBUG))

                # now merge with mappings to get asset_id column
                entity_type_df = pd.merge(left=entity_type_df, right=df_mappings, how='inner', on=[df_entity_type_column, df_index_id_name], sort=False)

            # at this point, entity_type_df is expected to use df_index_id_name, df_index_timestamp_name, and has df_asset_id_column

            # TODO potential name conflict among sensor/asset types, need to validate and reject when it happens

            # rename columns
            renamed_columns = {}
            for idx, name in enumerate(data_items_copy):
                name_type, name = name.split(':')
                if name_type == entity_type:
                    renamed_columns[name] = names_copy[idx]
            # filter only actual renamed pairs
            renamed_columns = {k:v for k, v in renamed_columns.items() if k != v}
            if len(renamed_columns) > 0:
                self.logger.debug('renamed_columns=%s for entity_type=%s', renamed_columns, entity_type)
                entity_type_df = entity_type_df.rename(columns=renamed_columns)

            entity_type_df = entity_type_df.astype({df_index_timestamp_name: 'datetime64[ms]'})
            
            #all_dfs.append(entity_type_df)

            #Added by Kewei
            if is_asset_data:
                asset_dfs.append(entity_type_df) # append asset data 
            else :
                all_dfs.append(entity_type_df) # append sensor data

            

            self.df_traces['loaded_n_mapped_%s' % entity_type] = entity_type_df
            self.logger.debug('df_loaded_n_mapped_%s=%s', entity_type, log_df_info(entity_type_df, head=5, logger=self.logger, log_level=logging.DEBUG))

        self.logger.debug('before merge, df=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))








        #if len(all_dfs) > 0:
        #    all_dfs = [d.set_index([self.df_asset_id_column, df_index_timestamp_name]) for d in all_dfs]
        #    all_dfs = [d.loc[~d.index.duplicated(keep='last')] for d in all_dfs]
        #   df = pd.concat(all_dfs, axis=1, sort=False)
        #    self.df_traces['merged'] = df
        #    self.logger.debug('df_merged=%s', log_df_info(df, head=-1))
        #    df = df.reset_index()










        self.logger.debug('Finished iterating over entity types. Retrieved %s asset DataFrames and %s sensor DataFrames', len(asset_dfs), len(all_dfs))

        if len(all_dfs) > 0:
            self.logger.debug('Fixing index on sensor DF...')
            all_dfs = [d.set_index([self.df_asset_id_column, df_index_timestamp_name]) for d in all_dfs]
            all_dfs = [d.loc[~d.index.duplicated(keep='last')] for d in all_dfs]
            df1 = pd.concat(all_dfs, axis=1, sort=False) # This is sensor dataframe
            df1 = df1.reset_index().sort_values(df_index_timestamp_name)
            
        if len(asset_dfs) > 0:
            self.logger.debug('Fixing index on failure DF...')
            asset_dfs = [d.set_index([self.df_asset_id_column, df_index_timestamp_name]) for d in asset_dfs]
            asset_dfs = [d.loc[~d.index.duplicated(keep='last')] for d in asset_dfs]
            df2 = pd.concat(asset_dfs, axis=1, sort=False) # This is the failure dataframe
            df2 = df2.reset_index().sort_values(df_index_timestamp_name)
        
        tot_dfs=[]
        if len(all_dfs) == 0 and len(asset_dfs) == 0: # no data found
            self.logger.warn('No data found')
        elif len(all_dfs) == 0: # no sensor, only asset data. e.g degradation curve
            #merge asset data
            tot_dfs = asset_dfs
        elif len(asset_dfs) == 0: # no asset dat, only sensor data. e.g un-surpervised AD
            #merge sensor data
            tot_dfs = all_dfs
        else: #found both sensor and asset data, need to merge then 
            # We will loop thru asset_id's and merge sensor with failure rec for each asset
            tolerance_window = pd.Timedelta(timedelta(minutes=1440, seconds = 0, milliseconds=0, microseconds=0)) # 24 hours
            self.logger.debug('Iterating over assets to merge sensor and asset data...')
            for asset_id, df11 in df1.groupby(self.df_asset_id_column):
                self.logger.debug('Current iteration asset: %s', asset_id)
                df11 = df11.sort_values(df_index_timestamp_name)
                df22 = df2[df2[self.df_asset_id_column] == asset_id] 
                df22.drop(self.df_asset_id_column, axis=1, inplace=True)
                df22 = df22.sort_values(df_index_timestamp_name)
                dff = pd.merge_asof(df11,df22,on=df_index_timestamp_name,tolerance=tolerance_window,direction='forward')
                self.logger.debug('Merged DF for asset id=%s: dff=%s', asset_id, log_df_info(dff, head=0, logger=self.logger, log_level=logging.DEBUG))
                tot_dfs.append(dff)

        if len(tot_dfs) > 0:
            self.logger.debug('Merging %s different failure & sensor DataFrames', len(tot_dfs))
            df = pd.concat(tot_dfs) 
            self.df_traces['merged'] = df
            if df is not None:
                df = df.reset_index()
            self.logger.debug('Merged DataFrames: df_merged=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))  

        # join with asset dimensions
        if asset_dimension is not None:
            if df is not None:
                self.logger.debug('df_merge=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
                self.logger.debug('asset_dimension=%s', log_df_info(asset_dimension, head=5, logger=self.logger, log_level=logging.DEBUG))

                df = df.merge(asset_dimension, how='left', left_on=self.df_asset_id_column, right_on=self.df_asset_id_column, sort=False)
                self.df_traces['merged_with_asset_dimension'] = df
                self.logger.debug('df_merged_with_asset_dimension=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
            else:
                # only asset dimension data
                df = asset_dimension

        if df is None:
            df = pd.DataFrame(columns=[self.df_asset_id_column, df_index_timestamp_name])

        self.logger.debug('before set_index, df=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        if df_index_timestamp_name not in df.columns:
            # special case that no time-series data is loaded, only dimensions
            # populate the time base column automatically, since 
            # this is just to conform to AS pipeline, this time base column is 
            # not going to be used
            df[df_index_timestamp_name] = pd.date_range(end=dt.datetime.utcnow(), periods=len(df), freq='T', tz='UTC')

        # sort by time before we do the merge
        df = df.set_index([self.df_asset_id_column, df_index_timestamp_name])
        to_drop = set(df.columns) - set(names_copy) - {self._entity_type._entity_id, self._entity_type._timestamp_col}
        df = df.drop(to_drop, axis=1).sort_index()
        self.df_traces['merged_sorted_n_column_dropped'] = df
        self.logger.debug('df_merged_sorted_n_column_dropped=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # reorder the columns according to the original given list
        if df.shape[0] != 0:
            df = df[names_copy]
        self.logger.debug('df_col_reordered=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        self.logger.debug('fillna=%s, fillna_exclude=%s, dropna=%s, dropna_exclude=%s', self.fillna, self.fillna_exclude, self.dropna, self.dropna_exclude)

        # forward fill with last non-na window and dropna afterward, making sure we have values for all cells
        # NEW code bign
        # remove dropna
        """ if not df.empty: # only needed when there's some data
            if self.fillna is not None:
                if self.fillna_exclude is None:
                    df = df.groupby([self.df_asset_id_column], sort=False).apply(lambda x: x.fillna(method=self.fillna))
                else:
                    to_fillna = list(set(names_copy) - set(self.fillna_exclude))
                    df[to_fillna] = df.groupby([self.df_asset_id_column], sort=False)[to_fillna].apply(lambda x: x.fillna(method=self.fillna))

            if self.dropna is not None:
                if self.dropna_exclude is None:
                    df = df.dropna(how=self.dropna)
                else:
                    df = df.dropna(how=self.dropna, subset=list(set(names_copy) - set(self.dropna_exclude))) """

        #New Code end


        # rename index to the standard one
        df = df.rename_axis([df_index_id_name if label == self.df_asset_id_column else label for label in df.index.names])

        self.df_traces['output'] = df.copy()

        self.logger.debug('df_final=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))
        
        return df

    def _load_asset_device_mappings(self):
        if self.asset_device_mappings is None or not isinstance(self.asset_device_mappings, dict) or len(self.asset_device_mappings) == 0:
            # note that left outer join from asset group table to asset device mapping tables is used to make sure 
            # in cases that either just asset data is used (hence no device) or no specific device mappings are 
            # created for an asset, those assets (in the group) are still being picked up

            # get asset group table
            asset_group_table = api._get_asset_group_table(db=self._entity_type.db, db_schema=self._entity_type._db_schema)
 
            # get AHI asset device mappings
            ahi_mappings_table = api._get_asset_device_attribute_mapping_table(db=self._entity_type.db, db_schema=self._entity_type._db_schema)
            query = sqlalchemy.select([asset_group_table.c['site'], asset_group_table.c['asset'], ahi_mappings_table.c['devicetype'], ahi_mappings_table.c['deviceid']]).select_from(asset_group_table.join(ahi_mappings_table, sqlalchemy.and_(ahi_mappings_table.c['site'] == asset_group_table.c['site'], ahi_mappings_table.c['asset'] == asset_group_table.c['asset']), isouter=True)).where(asset_group_table.c['assetgroup'] == self.asset_group)
            self.logger.debug('sql=%s', query)
            df_ahi_mappings = pd.read_sql(sql=query, con=self._entity_type.db.connection)
            #print(df_ahi_mappings)
            self.logger.debug('1. before df_pmi_mappings=%s', log_df_info(df_ahi_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))
            self.logger.debug('2. df_pmi_mappings type=%s', df_ahi_mappings.dtypes)

            df_ahi_mappings_2=df_ahi_mappings.copy()
            #df['bar'] = df['bar'].str.cat(df['foo'].values.astype(str), sep=' is ')
            df_ahi_mappings['assetfullid'] = df_ahi_mappings['asset'].str.cat(df_ahi_mappings_2['site'].values.astype(str),sep='-____-')

            #df_ahi_mappings['assetfullid'] = (df_ahi_mappings['asset'] + '-____-' + df_ahi_mappings['site'] ).astype(str)
            #print(df_pmi_mappings)
            #df_ahi_mappings['assetfullid'] = df_ahi_mappings['asset'].astype(str) + '-____-' + df_ahi_mappings['site'].astype(str)
            self.logger.debug('gate3: get the mapping of asset and device, df_ahi_mappings=%s', log_df_info(df_ahi_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))

            # load PMI asset device mappings here to merge with AHI side
            pmi_mappings_table = api._get_asset_device_mapping_table(db=self._entity_type.db, db_schema=self._entity_type._db_schema)
            query = sqlalchemy.select([asset_group_table.c['site'], asset_group_table.c['asset'], pmi_mappings_table.c['devicetype'], pmi_mappings_table.c['deviceid']]).select_from(asset_group_table.join(pmi_mappings_table, sqlalchemy.and_(pmi_mappings_table.c['site'] == asset_group_table.c['site'], pmi_mappings_table.c['asset'] == asset_group_table.c['asset']), isouter=True)).where(asset_group_table.c['assetgroup'] == self.asset_group)
            self.logger.debug('sql=%s', query)
            df_pmi_mappings = pd.read_sql(sql=query, con=self._entity_type.db.connection)
            #print(df_pmi_mappings)

            self.logger.debug('3. before df_pmi_mappings=%s', log_df_info(df_pmi_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))
            self.logger.debug('4. df_pmi_mappings type=%s', df_pmi_mappings.dtypes)

            df_pmi_mappings_2=df_pmi_mappings.copy()

            df_pmi_mappings['assetfullid'] = df_pmi_mappings['asset'].str.cat(df_pmi_mappings_2['site'].values.astype(str),sep='-____-')

            #df_pmi_mappings['assetfullid'] = (df_pmi_mappings['asset'] + '-____-' + df_pmi_mappings['site'] ).astype('string')
            #df_pmi_mappings['assetfullid'] = df_pmi_mappings['asset'].astype(str) + '-____-' + df_pmi_mappings['site'].astype(str)
            self.logger.debug('df_pmi_mappings=%s', log_df_info(df_pmi_mappings, head=5, logger=self.logger, log_level=logging.DEBUG))

            # merge them, with PMI as the base
            merged_mappings = defaultdict(set)
            for row in df_pmi_mappings.itertuples():
                assetfullid_string = row.asset+ '-____-'+ row.site
                #print('pmi assetfullid_string',assetfullid_string)
                #print(type(assetfullid_string))
                if pd.notna(row.devicetype) and pd.notna(row.deviceid):
                    
                    merged_mappings[assetfullid_string].add('%s:%s' % (row.devicetype, row.deviceid))

                    #merged_mappings[row.assetfullid].add('%s:%s' % (row.devicetype, row.deviceid))
                else:
                    # make sure the default set is created, but not adding anything
                    merged_mappings[assetfullid_string]
                    #merged_mappings[row.assetfullid]

            # with AHI extra devices added
            for row in df_ahi_mappings.itertuples():
                assetfullid_string = row.asset+ '-____-'+ row.site
                
                #print('ahi assetfullid_string',assetfullid_string)
                #print(type(assetfullid_string))
                if pd.notna(row.devicetype) and pd.notna(row.deviceid):
                    merged_mappings[assetfullid_string].add('%s:%s' % (row.devicetype, row.deviceid))
                    #merged_mappings[row.assetfullid].add('%s:%s' % (row.devicetype, row.deviceid))
                else:
                    # make sure the default set is created, but not adding anything
                    merged_mappings[assetfullid_string]
                    #merged_mappings[row.assetfullid]

            merged_mappings = {key: list(mappings) for key, mappings in merged_mappings.items()}

            self._set_asset_device_mappings(merged_mappings)

    def _set_asset_device_mappings(self, asset_device_mappings):
        self.asset_device_mappings, self.entity_type_meta = self._validate_mappings(asset_device_mappings)
        self.logger.info('input_asset_device_mappings=%s, asset_device_mappings=%s, entity_type_meta=%s', asset_device_mappings, self.asset_device_mappings, self.entity_type_meta)

    def set_data_substitution(self, entity_type, data, errors='ignore'):
        """Set entity type data substitution, including asset data.

        Parameters
        ----------
        entity_type : `str`
            The entity type name of which the data to bbe substituted. Must be a valid IoT entity type, 
            or an empty string representing asset data.
        data : `list` of `dict`
        errors : {'ignore', 'raise'}, optional
            Control raising ValueError on invalid `entity_type`.

            * raise : allow exceptions to be raised.
            * ignore : suppress exceptions. On error simply return.
        """

        if entity_type is None:
            raise ValueError('argument entity_type must not be None')

        if data is None:
            raise ValueError('argument data must not be None')

        # we need to run this here becuase execute() most like has not have been run yet 
        # and we need self.entity_type_meta for validation purpose
        self._load_asset_device_mappings()

        if entity_type not in self.entity_type_meta:
            if errors == 'raise':
                raise ValueError('invalid entity_type=%s' % entity_type)
            else:
                self.logger.warn('invalid entity_type=%s', entity_type)
                return

        # first merge all the give data substitution into two data frames, one for time-series and one for dimension

        df_index_id_name, df_index_timestamp_name = self._entity_type._df_index_entity_id, self._entity_type._timestamp

        df_substitution_ts = None
        df_substitution_dim = None
        for substitution in data:
            df_data = substitution['df']
            keys = substitution['keys']
            timestamp = substitution.get('timestamp')
            columns = substitution.get('columns')
            rename_columns = substitution.get('rename_columns')
            parse_dates = substitution.get('parse_dates')

            if isinstance(keys, str): 
                keys = [keys]
            if not isinstance(keys, tuple) and not isinstance(keys, list):
                raise ValueError('error keys given for data_substitution["%s"]: %s' % (entity_type, str(substitution['keys'])))

            if timestamp is not None and not isinstance(timestamp, str):
                raise ValueError('error timestamp given for data_substitution["%s"]: %s' % (entity_type, str(substitution.get('timestamp'))))
            if columns is not None and not isinstance(columns, tuple) and not isinstance(columns, list):
                raise ValueError('error columns given for data_substitution["%s"]: %s' % (entity_type, str(substitution.get('columns'))))
            if rename_columns is not None and not isinstance(rename_columns, dict):
                raise ValueError('error rename_columns given for data_substitution["%s"]: %s' % (entity_type, str(substitution.get('rename_columns'))))
            if parse_dates is not None and (not isinstance(parse_dates, list) or not all([isinstance(attr, str) for attr in parse_dates])):
                raise ValueError('error parse_dates=%s, must be a list of string' % str(parse_dates))

            # make sure and convert keys to be of type str
            df_data = df_data.astype({key: str for key in keys})

            if columns is None:
                columns = set(df_data.columns) - set(keys)
                if timestamp is not None:
                    columns = columns - set([timestamp])
                columns = list(columns)

            remaining_columns = list(keys)
            if timestamp is not None:
                remaining_columns.append(timestamp)
            remaining_columns.extend(columns)
            df_data = df_data[remaining_columns]

            if rename_columns is not None:
                # asset group entity type always uses the default timestamp column name 'event_timestamp', 
                # so we have to remove it if users put timestamp it in rename_columns somehow
                if timestamp in rename_columns:
                    del rename_columns[timestamp]
                if parse_dates is not None:
                    parse_dates = [rename_columns[col] if col in rename_columns else col for col in parse_dates]
                df_data = df_data.rename(columns=rename_columns)

            keys = keys.copy() # don't touch the given array
            original_key_0 = keys[0]
            keys[0] = df_index_id_name if entity_type != self.ASSET_TYPE else self.df_asset_id_column
            df_data = df_data.rename(columns={original_key_0: keys[0]})

            if parse_dates is not None:
                for col in parse_dates:
                    if col in df_data.columns:
                        df_data[col] = pd.to_datetime(df_data[col], utc=True)

            if timestamp is not None:
                # time-series

                # first rename/normalize timestamp column name
                df_data = df_data.rename(columns={timestamp: df_index_timestamp_name})
                df_data[df_index_timestamp_name] = pd.to_datetime(df_data[df_index_timestamp_name], utc=True).dt.tz_localize(None)
                timestamp = df_index_timestamp_name

                all_keys = list(keys)
                all_keys.append(timestamp)
                if df_substitution_ts is None:
                    df_substitution_ts = df_data
                else:
                    df_substitution_ts = df_substitution_ts.merge(df_data, left_on=all_keys, right_on=all_keys, how='outer')
            else:
                # dimension
                if df_substitution_dim is None:
                    df_substitution_dim = df_data
                else:
                    df_substitution_dim = df_substitution_dim.merge(df_data, left_on=keys, right_on=keys, how='outer')

        if df_substitution_dim is not None:
            df_substitution_dim = df_substitution_dim.rename(columns={df_substitution_dim.columns[0]: self.df_asset_id_column})

        # check if the given data substituion includes all data items used for this sensor entity type (not applicable to assets)
        if entity_type != self.ASSET_TYPE:
            if not set(df_substitution_ts.columns).issuperset(set(self.entity_type_meta[entity_type])):
                raise ValueError('data substitution (%s) for entity_type=%s must contain all columns (%s) used in the pipeline' % (list(df_substitution_ts.columns), entity_type, self.entity_type_meta[entity_type]))

        self.logger.debug('df_substitution_ts=%s', log_df_info(df_substitution_ts))
        self.logger.debug('df_substitution_dim=%s', log_df_info(df_substitution_dim))

        if entity_type == self.ASSET_TYPE:
            self.data_substitution[entity_type] = [df_substitution_ts, df_substitution_dim]
        else:
            # non-asset entity types do not have dimensions
            self.data_substitution[entity_type] = df_substitution_ts

    def clear_data_substitution(self):
        self.data_substitution = {}

    def _validate_mappings(self, asset_device_mappings):
        '''Cross-check data_items (features) and asset_device_mappings to see if any invalid entity types used.

        Invalid entity types means either there's no actual asset-device mapping serves some specified features, or 
        there's no features needed for some given asset-device mappings. These invalid entity types are logged and 
        simply ignored.

        Returns: A dict of entity type to its data items required to be loaded.
        '''

        if asset_device_mappings is None:
            return dict(), dict()

        # AS internall assume and transform id to string
        asset_device_mappings = {str(k):v for k, v in asset_device_mappings.items()}

        # fitlter out invalid device types
        need_validation = not api.is_local_mode()
        #all_entity_types = set(self.entity_type_metadata.keys()) if need_validation else set()
        all_entity_types = set([ent_type[1].get('name','') for ent_type in list(self.entity_type_metadata.items()) if isinstance(ent_type[1],dict)]) if need_validation else set()
        for asset_id, device_ids in asset_device_mappings.items():
            valid_device_ids = []
            for device_id in device_ids:
                device_type, did = device_id.split(':')
                if not need_validation or device_type in all_entity_types:
                    valid_device_ids.append(device_id)
                else:
                    self.logger.warning('ignore invalid device=%s in asset-device mappings' % device_id)
            asset_device_mappings[asset_id] = valid_device_ids
        self.logger.debug('asset_device_mappings=%s', asset_device_mappings)

        entity_type_meta = dict()

        # create a dict of entity types to their data items used
        features_meta = dict()
        for idx, item in enumerate(self.data_items):
            name_type, name = item.split(':')
            features_meta.setdefault(name_type, set()).add(name)
        self.logger.debug('features_meta=%s', features_meta)

        mappings_meta = set()
        for asset_id, device_ids in asset_device_mappings.items():
            for device_id in device_ids:
                entity_type, entity_id = device_id.split(':')
                mappings_meta.add(entity_type)
        self.logger.debug('mappings_meta=%s', mappings_meta)

        missing_feature_entity_types = set(features_meta.keys()) - mappings_meta - set([self.ASSET_TYPE])
        if len(missing_feature_entity_types) > 0:
            msg = 'cannot find any device of feature_entity_type=%s associated with assets of asset_group_id=%s, therefore there is no data for those features, please check if those asset-device mappings are set correctly, either from UI or by calling function pmlib.set_asset_device_mappings()' % (str(list(missing_feature_entity_types)), self.asset_group)
            self.logger.error(msg)
            raise RuntimeError(msg)

        for entity_type in (mappings_meta - set(features_meta.keys())):
            self.logger.warning('ignore unused entity_type=%s in asset_device_mappings, not used in data_items' % entity_type)

        for entity_type in (set(features_meta.keys()) & mappings_meta):
            entity_type_meta[entity_type] = list(features_meta[entity_type])
        if self.ASSET_TYPE in features_meta:
            entity_type_meta[self.ASSET_TYPE] = list(features_meta[self.ASSET_TYPE])

        return (asset_device_mappings, entity_type_meta)

    def _validate_data_items(self):
        """Check every configured data item against the entity type metadata.

        Validation is skipped in local mode. Asset data items (entity type equal to ``ASSET_TYPE``)
        are not checked. Data item names are compared in lower case.

        Raises
        ------
        ValueError
            If the entity type of a data item cannot be found in the metadata, or the entity type has
            no data item of the given name.
        """
        if api.is_local_mode():
            self.logger.debug('local_mode=True, all_entity_types=set()')
            return

        # the known entity type names are collected for the debug log only
        known_entity_types = {
            meta.get('name', '')
            for _, meta in self.entity_type_metadata.items()
            if isinstance(meta, dict)
        }
        self.logger.debug('Retrieved entity types from Monitor. all_entity_types=%s', known_entity_types)

        for item in self.data_items:
            name_type, name = item.split(':')
            if name_type == self.ASSET_TYPE:
                # asset data
                continue

            entity_type_instance = api.get_entity_from_metadata(self.entity_type_metadata, name_type)
            if entity_type_instance is None:
                raise ValueError('"%s" has invalid entity type (case-sensitive)' % item)

            data_item_names = set()
            for dto in entity_type_instance['dataItemDto']:
                data_item_names.add(dto['name'].lower())

            if name.lower() in data_item_names:
                continue

            self.logger.debug('name=%s not found in entity_type=%s data_items=%s', name, name_type, data_item_names)
            raise ValueError('"%s" is an invalid data item (case-sensitive)' % item)
