# Licensed Materials - Property of IBM
# 5737-M66, 5900-AAA, 5900-AMG
# (C) Copyright IBM Corp. 2019, 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication, or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.

"""Base asset group model pipelines.
"""

# Documentation overrides keyed by (qualified) member name; every listed
# name maps to False. Presumably consumed by pdoc to hide these members from
# the generated API docs -- confirm against the documentation build.
__pdoc__ = dict.fromkeys(
    (
        'AssetGroupPipeline.is_data_source',
        'AssetGroupPipeline.merge_method',
        'AssetGroupPipeline.set_asset_device_mappings',
        'SimpleCustomAssetGroupPipelineLoader.is_data_source',
        'SimpleCustomAssetGroupPipelineLoader.merge_method',
        'and_',
        'select',
    ),
    False,
)

import json
import logging
import os
import re
from collections import OrderedDict, defaultdict
from datetime import datetime
from typing import List

import dill
import pandas as pd
from pandas.tseries.offsets import DateOffset
from sqlalchemy import and_, func, select

from iotfunctions.base import BaseTransformer
from iotfunctions.db import Database
from iotfunctions.metadata import EntityType
from pmlib import monitor_api
from pmlib.api import get_entity_type_id

from . import api
from .cache_loader import AssetCacheRefresher
from .estimator import (BaseEstimator, WmlDeploymentEstimator,
                        WmlSPSSDeploymentEstimator)
from .loader import AggregatedAssetLoader, AssetLoader
from .persist import PersistColumns
from .transformer import MissingValueImputationTransformer
from .util import (_mkdirp, camel_to_snake, find_list_duplicate, get_as_schema,
                   get_logger, log_df_info, remove_list_duplicate,
                   setup_logging)

# TODO get frequency list from AS
# Frequencies defined by default, as (name, description, alias) rows expanded
# into dicts. The aliases look like pandas offset aliases -- confirm against
# the consumer of this list.
DEFAULT_FREQUENCY = [
    {"name": name, "description": description, "alias": alias}
    for name, description, alias in (
        ("Daily", "Daily summary", "D"),
        ("Hourly", "Hourly summary", "H"),
        ("Minute", "minute summary", "min"),
    )
]

'''
old code
DEFAULT_FREQUENCY = [
    {
        "name": "Daily",
        "description": "Daily summary",
        "alias": "D"
    },
    {
        "name": "Monthly",
        "description": "Monthly summary",
        "alias": "MS"
    },
    {
        "name": "Weekly",
        "description": "Weekly summary",
        "alias": "W"
    },
    {
        "name": "Hourly",
        "description": "Hourly summary",
        "alias": "H"
    }
]
'''

# https://github.ibm.com/wiotp/Maximo-Asset-Monitor/issues/4766
# replace entityFirst with aggregateByDevice
# Single default granularity entry ("Daily"); unlike the custom granularities
# it carries "tenant", "aggregateByDevice" and "sourceTableName" keys.
DEFAULT_GRANULARITY = [
    dict(
        name="Daily",
        tenant="###_IBM_###",
        description="Daily summary",
        frequency="Daily",
        dataItems=[],
        aggregateByDevice=False,
        sourceTableName=None,
    ),
]
# APM Predict custom granularities for all asset groups
# For MAS8.7 remove Weekly and Monthly
# Rows are (name, description, frequency, entityFirst); every generated entry
# gets its own fresh, empty "dataItems" list.
CUSTOM_GRANULARITY = [
    {
        "name": name,
        "description": description,
        "frequency": frequency,
        "dataItems": [],
        "entityFirst": entity_first,
    }
    for name, description, frequency, entity_first in (
        ("Hourly", "Hourly", "Hourly", True),
        ("GroupHourly", "GroupHourly", "Hourly", False),
        ("Minute", "Minute summary", "Minute", True),
        ("GroupMinute", "GroupMinute", "Minute", False),
    )
]


# Granularities kept only for backward compatibility. The constant's name is
# misspelled but is referenced under this spelling when GRANULARITY_NAMES is
# built, so it is kept as-is.
DEPRECECATED_GRANULARITY = [
    dict(
        name="GroupDaily",
        description="GroupDaily",
        frequency="Daily",
        dataItems=[],
        entityFirst=False,
    ),
]

'''
Old ccode
CUSTOM_GRANULARITY = [
    {
        "name": "Daily",
        "description": "Daily",
        "frequency": "Daily",
        "dataItems": [],
        "entityFirst": False
    },
    {
        "name": "Hourly",
        "description": "Hourly",
        "frequency": "Hourly",
        "dataItems": [],
        "entityFirst": True
    },
    {
        "name": "GroupHourly",
        "description": "GroupHourly",
        "frequency": "Hourly",
        "dataItems": [],
        "entityFirst": False
    }


    {
        "name": "Weekly",
        "description": "Weekly",
        "frequency": "Weekly",
        "dataItems": [],
        "entityFirst": True
    },
    {
        "name": "GroupWeekly",
        "description": "GroupWeekly",
        "frequency": "Weekly",
        "dataItems": [],
        "entityFirst": False
    },
    {
        "name": "Monthly",
        "description": "Monthly",
        "frequency": "Monthly",
        "dataItems": [],
        "entityFirst": True
    },
    {
        "name": "GroupMinute",
        "description": "GroupMinute",
        "frequency": "Minute",
        "dataItems": [],
        "entityFirst": False
    }
    

]

'''

# TODO: Health Check API Here
# NOTE: this health check runs at import time.
is_91_instance = api.monitor_health_check()

# Maps the snake_case form of each granularity name to its original name.
# The deprecated granularities are included unless the health check returned
# exactly True.
GRANULARITY_NAMES = {
    camel_to_snake(g['name']): g['name']
    for g in (
        DEFAULT_GRANULARITY + CUSTOM_GRANULARITY
        if is_91_instance is True
        else DEFAULT_GRANULARITY + CUSTOM_GRANULARITY + DEPRECECATED_GRANULARITY
    )
}
# Maps a short aggregation method key to its long-form name.
AGGREGATION_METHODS = dict(
    count='Count',
    first='First',
    last='Last',
    max='Maximum',
    mean='Mean',
    median='Median',
    min='Minimum',
    product='Product',
    std='StandardDeviation',
    sum='Sum',
    var='Variance',
)


# Set up default logging as a module-level side effect (runs on import).
# setup_logging() is called without arguments, so whatever defaults that
# helper defines are what get applied.
setup_logging()


class _AssetGroupEntityType(EntityType):
    """Entity type representing an asset group.

    The name handed to the base class is ``APM_<asset_group_id>`` converted
    to lower case; ``db`` and any extra keyword arguments are forwarded to
    the base class unchanged.
    """

    def __init__(self, asset_group_id, db, **kwargs):
        entity_type_name = ('APM_%s' % asset_group_id).lower()
        super().__init__(entity_type_name, db, **kwargs)

    def get_data(self, start_ts=None, end_ts=None, entities=None, columns=None):
        """Return ``None``; all arguments are accepted but ignored."""
        return None


class AssetGroupPipeline(BaseTransformer):
    """Base class for asset group model pipeline.

    This class provides the foundation for model pipelines. It wraps and handles all the data loading, 
    including asset and IoT device data, and their merging/joining. It provides the capability to 
    train models from the loaded data, as well as registering the trained models to APM system so that 
    they can be used to generate predictions for the incoming new data continuously.

    Most of the pipeline configuration is by the parameter `model_pipeline`, which is a dict. The pipeline 
    configuration determines the inputs and outputs of the pipeline instance to be created. There are 3 keys 
    used by all sub-class pipelines: 
        
    * `features`: the input data names to the models for both training and scoring.
    * `features_for_training`: the input data names to the models, only for training. Usually these are 
    training labels/targets.
    * `predictions`: the output data names from the models.

    The value of these 3 keys are all list of strings, of which each element represents an attribute, 
    either of assets or IoT devices. The elements are specified in the format of '<type>:<attribute_name>', 
    of which <type> can be an empty string '' representing assets' attributes, or a non-empty 
    string <type> being an IoT device type.

    There is another very useful parameter for resampling features, which can be used for feature extraction 
    or down sampling features. For example, for some models, it is very useful to use various statistics 
    of input features instead of raw time-series data, like producing and using all of the min/max/mean/std 
    of a feature instead of the feature itself alone. Another example is for very frequent input, down sampling 
    to 1 event per minute might considerably reduce the workload generated while still preserving the model's 
    accuracy.

    * `features_resampled`: `dict`

        The keys of this `dict` are types used in `features`, as resampling is defined for each type separately. 
        For each type's resampling, a mandatory key `${freqency}` must be provided, specifying the resampling 
        frequency for this type, given as [Pandas offset alias](http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).
        The other keys of a type give the actual features to be resampled, each with a value that is yet another `dict` 
        mapping aggregation methods to resampled feature names. The aggregation methods currently supported 
        are `{'max', 'min', 'mean', 'count', 'sum', 'std', 'first', and 'last'}`. 
        The resampled feature names can be any literal you want, as long as it does not conflict with other 
        metrics or data items within this asset group. A special syntax allows the use of a few special values 
        for the resampled feature name: `${method}` and `${data_item}`. By default, it uses `${data_item}_${method}` 
        if you use `None` for resampled feature name. Note that when only one aggregation method is used for 
        a feature and no resample feature name is given, then by default the original feature name is used, 
        essentially the resampled feature 'replaces' the original one.

    Here are a few resample examples:

    ```
    model_pipeline={
    'features': ['pump:axlevibration'],
    'features_resampled': {
        'pump': {
            '${freqency}': '5T',
            'axlevibration': {
                'max': None,
                'mean': None,
                'min': None,
            },
        },
    },

    ```

    This example results in 3 features: `pump:axlevibration_max`, `pump:axlevibration_mean`, and `pump:axlevibration_min`. 
    These are the 5-minute aggregations (`5T` is the Pandas offset alias for 5 minutes), using max/mean/min, respectively.

    Let's see another slightly different example:

    ```
    model_pipeline={
    'features': ['pumpA:axlevibration', 'pumpB:axlemomentum', 'pumpC:load'],
    'features_resampled': {
        'pumpA': {
            '${freqency}': '5T',
            'axlevibration': {
                'max': None,
                'mean': None,
                'min': None,
            },
        },
        'pumpC': {
            '${freqency}': '15T',
            'load': {
                'mean': None,
            },
        },
    },

    ```

    This example results in 5 features: `pumpA:axlevibration_max`, `pumpA:axlevibration_mean`, `pumpA:axlevibration_min`, 
    `pumpB:axlemomentum`, and `pumpC:load`. The first 3 are resampled as in the previous example. The 4th is the original, 
    non-resampled one. The last one is the 15-minute mean of `load` in type `pumpC` (but with the same name as the original).

    Note that resampling can also be used for `features_for_training`, not just `features`. Also, currently only IOT types are 
    supported, but not assets.

    Another common pipeline configuration used is the key `srom_training_options` of the dict 
    parameter `model_pipeline`. It provides an easy way to pass parameters to SROM training pipeline's 
    `execute` method. Here are the options available:

    * `exectype`: {'single_node_complete_search', 'single_node_random_search', 'spark_node_complete_search', 'spark_node_random_search'}

        'spark': executes the pipeline on a Spark cluster, 
        'single_node' or 'single_node_complete_search': executes the pipeline for all parameter samples on single node, 
        'single_node_random_search': executes the pipeline for random parameter samples on single node, 
        'spark_node_complete_search': executes the pipeline for all parameter samples on spark, 
        'spark_node_random_search': executes the pipeline for random parameter samples on spark.

    * `verbosity`: {'low', 'medium', 'high'}

    * `n_jobs`: int

        Number of parallel jobs, applicable to non-Spark only.

    * `pre_dispatch`: int

        Number of jobs that get dispatched during parallel execution.

    * `num_option_per_pipeline`: int

        Number of parameter settings that are sampled, applicable to random-search mode only.

    * `max_eval_time_minute`: int

        Maximum timeout for execution of pipelines with unique parameter grid combination, applicable to Spark only.

    If you don't specify any of these options, SROM defaults are used. Please read SROM library documentation for details on these.

    The default data cleansing strategy uses:

    * `fillna`: ffill
    * `dropna`: any
    * `fillna_exclude`: all items in `features_for_training`
    * `dropna_exclude`: all items in `features_for_training`

    This strategy works well with most cases. Because data could be from multiple sources (sensors) joined 
    together on timestamp, filling a missing value with the last observed one works fine in most cases. There's 
    also optional `fillna_exclude` to exclude specific features from this strategy. On the other hand, most 
    models expect all features to have values, so dropping rows with any feature value missing works well. 
    Similarly, `dropna_exclude` is also available to exclude specific features from dropping rows with missing 
    values.

    Note that by default, all items in `features_for_training` are automatically added to both `fillna_exclude` 
    and `dropna_exclude`. See `pmlib.pipeline.AssetGroupPipeline.prepare_fillna_exclude` and 
    `pmlib.pipeline.AssetGroupPipeline.prepare_dropna_exclude` for details.

    All pipelines must inherit from this class.

    Parameters
    ----------
    asset_group_id: `str`
        The asset group ID this pipeline belongs to.
    model_pipeline: `dict`
        Model pipeline configuration. See above for details. This parameter is required, but can be omitted 
        when `model_instance_id` is given in which case this is loaded from the previous trained model 
        instance.
    model_instance_id: `str`, optional
        Create a pipeline object from an existing model instance's ID, essentially recreating the model 
        pipeline trained and registered before. This is for loading previously trained model instance 
        as a pipeline object in notebook for scoring and analysis purpose. When this parameter is given, 
        all other parameters are ignored. Default is None.
    summary : `dict`, optional
        Custom summary outputs to be generated. Scoring results are generated for each incoming data. 
        Summary outputs can be generated on top of scoring results, for example, hourly average or 
        daily maximum. Each derived model pipeline class could have its own default summary outputs. 
        If this parameter is not given, then default summary outputs are used; otherwise, this custom 
        summary output overrides the default ones. See `pmlib.pipeline.AssetGroupPipeline.default_summary` 
        for details on how to use it.
    incremental_summary : `bool`, optional
        Summary outputs are for some time window, say hourly, daily, or monthly. The latest time window's 
        *accurate* summary outputs cannot be generated until time has passed the end of the window. In production 
        deployments, this parameter should be left to False to ensure predictions are as accurate as possible. 
        However, if you are testing and want to see results right away, you can change this parameter to true.
    published_outputs : `dict`, optional
        Custom model pipeline outputs to be published to Maximo Health. To make model pipeline 
        outputs available in Maximo Health, for example, to be used in conditional monitoring or scoring 
        formula, they must be published. If this parameter is not given, then default published outputs are 
        used; otherwise, these custom published outputs override the default ones. 
        See `pmlib.pipeline.AssetGroupPipeline.default_published_outputs` for details on how to use it.
    fillna: {'backfill', 'bfill', 'pad', 'ffill', None}, optional
        Fill NA values using the specified method. 'pad' / 'ffill': propagate last valid observation 
        forward to next valid, 'backfill' / 'bfill': use next valid observation to fill gap. Default is 'ffill', 
        that is, use last valid observation of a feature for any missing value.
    fillna_exclude: `list` of `str`, optional
        When filling NA values, the features to exclude from filling NA values. Default is None, that is, all 
        features' NA values are to be filled.
    dropna: {'any', 'all'}, optional
        Remove missing values. 'any': if any NA values are present in any column, drop that row. 'all': if all 
        values are NA, drop that row. Default is 'any', that is, drop a row if any feature is missing value.
    dropna_exclude: `list` of `str`, optional
        When dropping NA values, the features to exclude from checking NA values. Default is None, that is, all 
        features are checked if having NA values.
    model_template_name: `str`, optional
        The display name of this model pipeline to be registered. Derived classes should give their desired 
        names via this parameter. If not given, the model pipeline's class name is used instead. Default is None, 
        which uses the class name. Note that this is separate from the model template ID which is always using 
        the class name. Model template ID must be unique in a tenant, though model template name is not 
        required to be unique.
    model_template_desc: `str`, optional
        The model template's description to be registered for the model. Derived classes should give their desired 
        one via this parameter. If not given, default is None, in which case set to be the same as `model_template_name`.
    pre_trained_model=None, optional
        The pre_trained_model to be SROM SmartClassification, SmartRegression or SromPipeline
    """

    is_data_source = True
    """Asset data loader is a data source in AS.

    Data source functions are executed at the beginning stage of AS pipelines.
    """

    merge_method = 'replace'
    """Asset data loader is primary data source which always replaces upstream dataframe with its own.

    Using replace merge method means our asset data loader is the one responsible for what should be the 
    dataframe passed to downstream. Internally, our asset data loader does all the necessary merging and 
    joining, between the input upstream dataframe and loaded asset data. This merge method makes sure 
    AS does not intervene with this logic.
    """

    @classmethod
    def get_model_instances(cls, asset_group_id):
        """List the model instances of an asset group that belong to this pipeline class.

        The model instances of the asset group are fetched through the API and
        grouped per model template. The group whose ``modelTemplateId`` equals
        this class's name is selected and its ``instanceList`` is returned,
        for example:

        ```
        [
            {
                'modelInstanceId': '94FBC0D0-A60E-40A5-93C2-ADF614771268',
                'modelInstanceName': '1010_AnomalyDetectionAssetGroupPipeline_2019-12-16T13:25:28.455520',
                'modelInstanceDesc': '1010_AnomalyDetectionAssetGroupPipeline_2019-12-16T13:25:28.455520',
                'enabled': True,
            },
        ]
        ```

        Parameters
        ----------
        asset_group_id: `str`
            The asset group ID to which the returned model instances belong.

        Returns
        -------
        `list` of `dict`, of which each `dict` represents a model instance, or
        `None` when the API call returns nothing, the response has no
        'modelInstanceList', or no template entry matches this class.
        """

        response = api.get_model_instance(asset_group_id=asset_group_id)
        if response is None:
            return None

        # the payload may be an empty dict, in which case there is nothing to search
        templates = response.json().get('modelInstanceList', None)
        if templates is None:
            return None

        # first template entry registered under this class's name wins
        return next(
            (
                template['instanceList']
                for template in templates
                if template['modelTemplateId'] == cls.__name__
            ),
            None,
        )

    def __init__(self, asset_group_id=None, model_pipeline=None,
                 model_instance_id=None,
                 summary=None, incremental_summary=False,
                 published_outputs=None,
                 fillna='ffill', fillna_exclude=None,
                 dropna='any', dropna_exclude=None,
                 model_timestamp=None,
                 local_model=True,
                 model_template_name=None,
                 model_template_desc=None,
                 pre_trained_model=None,
                 **kwargs):
        """Initialize and validate the pipeline.

        The user-facing parameters are described in the class docstring. In
        addition:

        * `model_timestamp`: when None (the default) the pipeline is put in
          training mode (`self.training` is True).
        * `local_model`: stored on the instance; the configuration sent on
          registration always uses False.
        * `**kwargs`: the following keys are read: `debug`,
          `asset_device_mappings` (obsolete), `data_substitution`, `db`,
          `db_schema`, `event_timestamp_column_name` (default
          'evt_timestamp') and `granularity` (extra granularities appended
          to `CUSTOM_GRANULARITY`).

        When `model_instance_id` is given, the model instance is fetched
        through the API and `model_pipeline`, `incremental_summary`,
        `fillna`, `fillna_exclude`, `dropna`, `dropna_exclude`,
        `asset_device_mappings`, `model_timestamp` and `local_model` are
        taken from its stored input instead of from the arguments.

        Raises
        ------
        ValueError
            If `asset_group_id` is None, if the model instance for
            `model_instance_id` cannot be found, or if no `model_pipeline`
            is available.
        """
        super().__init__()

        self.kwargs = kwargs

        self.pre_trained_model = pre_trained_model

        self.df_traces = dict()
        self.logger = get_logger(self)
        self.debug = kwargs.get('debug', False) # for controlling iotfunction's to_csv
        self.training_timestamp = None
        self.new_training = False
        self.new_training_output_df = None

        self.logger.info('Initializing %s Asset Group Pipeline...', model_template_name if model_template_name is not None else 'Unspecified')

        # set user-friendly model name (not id), by default, if not given, set to class name same as model template id
        if model_template_name is not None:
            self.model_template_name = model_template_name
        else:
            self.model_template_name = self.__class__.__name__
        if model_template_desc is not None:
            self.model_template_desc = model_template_desc
        else:
            self.model_template_desc = self.model_template_name

        self.asset_group_id = asset_group_id
        if self.asset_group_id is None:
            raise ValueError('parameter asset_group_id cannot be None')
        self.asset_group_id = str(self.asset_group_id)

        # obsolete-for-end-user, but still useful for unit-testing, so hidden 
        # from normal parameter
        asset_device_mappings = kwargs.get('asset_device_mappings', None)
        self.data_substitution = kwargs.get('data_substitution', None)
        db = kwargs.get('db', None)
        db_schema = kwargs.get('db_schema', None)

        self.event_timestamp_column_name = kwargs.get('event_timestamp_column_name', 'evt_timestamp')

        granularity = kwargs.get('granularity', []) # TODO add validation of the given granularity

        # making sure environment variables necessary for PMI are all set
        self.apm_id, self.apm_api_baseurl, self.apm_api_key = api.init_environ()
        db_schema = get_as_schema(db_schema)

        self.model_instance_id = model_instance_id
        self.loaded_model_instance = None
        if self.model_instance_id is not None:
            self.logger.info('User passed in a model instance ID- initializing pipeline based on this model...')
            # TODO get model instance and set parameters accordingly
            resp = api.get_model_instance(asset_group_id, self.model_instance_id)
            if resp is None:
                raise ValueError('error finding model_instance_id=%s, please check if it is a valid one' % self.model_instance_id)

            # keep it so we can register again to update something (modelKpi must not be touched)
            self.loaded_model_instance = resp.json()
            # these three fields arrive as JSON strings and are decoded in place
            self.loaded_model_instance['modelCosPath'] = json.loads(self.loaded_model_instance['modelCosPath'])
            self.loaded_model_instance['modelKpi'] = json.loads(self.loaded_model_instance['modelKpi'])
            self.loaded_model_instance['modelPostProcess'] = json.loads(self.loaded_model_instance['modelPostProcess'])
            # {
            #     'modelCosPath': ...
            #     'modelKpi': ...
            #     'modelGranularity': ...
            #     'modelPostProcess': ...
            #     'modelInstanceName': ...
            #     'predictWindow': ...
            #     'modelOutput': ...
            #     'modelInstanceDesc': ...
            #     'modelInstanceId': ...
            #     'modelTemplateId': ...
            #     'moodelInput': ...
            #     'kpiResponse': ...
            #     'status': ...
            # }

            # extract most of the useful stuff from 'modelInput' to restore the 'state' of this object
            # (the key really is spelled 'moodelInput' in the payload, see the listing above)
            model_input = json.loads(self.loaded_model_instance['moodelInput'])
            model_pipeline = model_input.get('model_pipeline', None)
            incremental_summary = model_input.get('incremental_summary', True)
            fillna = model_input.get('fillna', None)
            fillna_exclude = model_input.get('fillna_exclude', None)
            dropna = model_input.get('dropna', None)
            dropna_exclude = model_input.get('dropna_exclude', None)
            asset_device_mappings = model_input.get('asset_device_mappings', None)
            model_timestamp = model_input.get('model_timestamp', None)
            local_model = model_input.get('local_model', None)

        if model_pipeline is None:
            raise ValueError('parameter model_pipeline cannot be None')

        self.model_pipeline = model_pipeline
        self.incremental_summary = incremental_summary
        self.model_timestamp = model_timestamp
        self.training = (self.model_timestamp is None) #and (self.pre_trained_model is None)
        self.local_model = local_model

        # NOTE: list.copy() is shallow - the list is new, but the granularity
        # dicts themselves are still the module-level CUSTOM_GRANULARITY entries
        self.granularity = CUSTOM_GRANULARITY.copy()
        self.granularity.extend(granularity)

        # initialize Monitor database
        if db is None:
            db: Database = api._get_db(asset_group_id=self.asset_group_id)
        self.db = db
        self.db_schema = db_schema

        # only set entity type ID if retrieving an existing model
        if self.model_instance_id is not None and self.db.model_store.entity_type_id is not None:
            db.model_store.entity_type_id = api.get_entity_type_id_by_entity_type_name(asset_group_id)

        self.entity_type_metadata = self.db.entity_type_metadata.copy()

        # For Monitor 2.0, we use _timestamp_col
        self._entity_type = _AssetGroupEntityType(
            self.asset_group_id,
            self.db,
            **{
                '_timestamp_col': self.event_timestamp_column_name,
                '_db_schema': self.db_schema,
                'logical_name': self.asset_group_id,
                '_entity_type_id': self.db.entity_type_metadata[self.asset_group_id].get('entityTypeId', None) if self.asset_group_id in self.db.entity_type_metadata else None,
            })

        self.pipeline_config = self._get_pipeline_config()

        # used by method register() later, sent to AS
        self.config = {
            'asset_group_id': self.asset_group_id,
            'model_pipeline': self.model_pipeline.copy(),
            'incremental_summary': incremental_summary,
            'fillna': fillna,
            'fillna_exclude': fillna_exclude,
            'dropna': dropna,
            'dropna_exclude': dropna_exclude,
            'local_model': False, # on AS it is always False
        }

        # remove notebook training only config from sending to AS
        if 'override_training_stages' in model_pipeline:
            del self.config['model_pipeline']['override_training_stages']

        if asset_device_mappings is not None and isinstance(asset_device_mappings, dict) and len(asset_device_mappings) > 0:
            # if a static asset_device_mappings is given explicitly, which can only happen when in notebook, 
            # pass on to self.config (which is only used in register). this is a special condition when 
            # static mappings are set by users (ex. sample data). normally, mappings are dynamically queried.
            self.config['asset_device_mappings'] = {asset.upper():devices for asset, devices in asset_device_mappings.items()}
            self.logger.warning('parameter asset_device_mappings is obsolete, please use method pmlib.set_asset_device_mappings() instead')

        self.logger.debug('Pipeline configuration: %s', self.config)

        # validate data substitution, do it in constructor right away instead of until execute() is run
        # loader created here is used only for validation then discarded. a new loader is always created 
        # each time execute() runs
        loader = AssetLoader(asset_group=self.asset_group_id, 
                             _entity_type=self._entity_type, 
                             data_items=list(self.pipeline_config['inputs']), 
                             names=list(self.pipeline_config['renamed_inputs']), 
                             resamples=dict(), 
                             entity_type_metadata=self.db.entity_type_metadata, 
                             asset_device_mappings=self.config.get('asset_device_mappings', None))
        if self.data_substitution is not None:
            for entity_type in self.data_substitution:
                loader.set_data_substitution(entity_type, self.data_substitution[entity_type], errors='raise')

        # we don't send the original 'summary' to AS but instead sending 'self.post_processing'
        # so if loading instance back, we need to get it from loaded model instance instead of regenerating 
        # it from summary
        if self.loaded_model_instance is not None:
            self.post_processing, self.published_outputs = self.loaded_model_instance['modelPostProcess'], self.loaded_model_instance.get('modelPublishedOutputs', dict())
        else:
            self.post_processing, self.published_outputs = self._prepare_post_processing(summary, published_outputs, self.pipeline_config)

        # any post processing entry using the 'GroupDaily' granularity (matched
        # case-insensitively) is rewritten to use 'Daily'
        for p in self.post_processing:
            if p["granularity"].lower() == "groupdaily":
                self.logger.debug('Remapping post processing granularity from %s to Daily', p["granularity"])
                p["granularity"] = "Daily"
        self.logger.debug('Retrieved post processing configuration: post_processing=%s', self.post_processing)
        self.logger.debug('Incremental summary enabled: incremental_summary=%s', self.incremental_summary)
        self.logger.debug('Retrieved published outputs: published_outputs=%s', self.published_outputs)

        # this needs to be after post_processing handling since it uses that info
        self._validate_pipeline_outputs()

        self.logger.info('Finished initializing Asset Group Pipeline.')

    def _get_pipeline_config(self):
        """Build and return the model pipeline configuration.

        A `_ModelPipelineConfig` is created from this pipeline's database,
        entity type metadata, training flag and the entries of
        `self.model_pipeline`. It is then handed to `prepare_model_config`
        and finally extended with the key 'latest_prediction_timestamp',
        computed from its 'predictions' entry.
        """
        config = _ModelPipelineConfig(
            db=self.db,
            entity_type_metadata=self.entity_type_metadata,
            training=self.training,
            **self.model_pipeline
        )

        # hook for sub classes to adjust the configuration in whatever way they like
        self.prepare_model_config(config)
        self.logger.debug('Subclass\'s prepared_model_config: %s', config)

        predictions = config['predictions']
        config['latest_prediction_timestamp'] = self._get_latest_prediction_timestamp(predictions)

        return config

    def _validate_pipeline_outputs(self):
        """
        Checks pipeline outputs to ensure they are not duplicates of any other 
        pipeline outputs for models assigned to this asset group.
        Raises an exception if duplicates are found.
        """

        outputs = self.pipeline_config['predictions'] + [post['output']['name'] for post in self.post_processing]
        self.logger.debug('Validating pipeline outputs: current_outputs=%s', outputs)

        # check if outputs themselves have duplicates
        dups = []
        if len(set(outputs)) < len(outputs):
            seen = set()
            for output in outputs:
                if output not in seen:
                    seen.add(output)
                else:
                    dups.append(output)
        if len(dups) > 0:
            raise ValueError('check if outputs themselves have duplicates, invalid model_pipeline.predictions having duplicated_names=%s, please use different names' % list(set(dups)))

        outputs = set(outputs)

        # validate outputs not duplicated
        # only when in non-local model or with a freshly created pipeline object (not scoring) do we need to check
        if not api.is_local_mode() and self.model_timestamp is None:
            dups = []
            dup_model_instances = []

            # validate against PMI model metadata
            model_instances = self.__class__.get_model_instances(asset_group_id=self._entity_type.logical_name)
            if model_instances is not None:
                for model_instance in model_instances:
                    model_instance_id = model_instance['modelInstanceId']
                    self.logger.debug('Checking for duplicates on model model_instance_id=%s', model_instance_id)

                    one_model_instance = api.get_model_instance(asset_group_id=self._entity_type.logical_name, model_instance_id=model_instance_id)
                    if one_model_instance is None:
                        continue

                    one_model_instance = one_model_instance.json()

                    model_instance_inputs = json.loads(one_model_instance['moodelInput'])['model_pipeline']
                    self.logger.debug('model_instance_id=%s, model_instance_inputs=%s, current_model_pipeline=%s, equal=%s', 
                                      model_instance_id, 
                                      model_instance_inputs, 
                                      self.config['model_pipeline'], 
                                      model_instance_inputs == self.config['model_pipeline'])

                    kpis = json.loads(one_model_instance['modelKpi'])['output']['names']
                    agg_kpis = [post['output']['name'] for post in json.loads(one_model_instance['modelPostProcess'])]
                    model_instance_outputs = set(kpis + agg_kpis)
                    self.logger.debug('model_instance_outputs=%s, kpis=%s, agg_kpis=%s', model_instance_outputs, kpis, agg_kpis)

                    if model_instance_inputs == self.config['model_pipeline']:
                        # find the model instance for this pipeline, remove the outputs from the to-be-checked list
                        outputs = outputs - model_instance_outputs
                    elif len(outputs & model_instance_outputs) > 0:
                        # not the same model instance but has same output(s)
                        dups.extend(outputs & model_instance_outputs)
                        dup_model_instances.append(model_instance_id)
            if self._entity_type.logical_name in self.entity_type_metadata:
                data_item_names = {dto['name'].lower() for dto in self.entity_type_metadata[self._entity_type.logical_name]['dataItemDto']}
                dups = [output for output in outputs if output in data_item_names]

            # when a new asset group pipeline is to be created, there's no metadata on AS server yet but 
            # there cannot be duplicated in anyway

            if len(dups) > 0:
                raise ValueError('invalid model_pipeline.predictions having duplicated_names=%s with other_model_instances=%s or with existing data items, please use different names' % (list(set(dups)), dup_model_instances))
            self.logger.debug('Finished validating pipeline outputs. No issus detected.')

    def prepare_fillna_exclude(self, fillna_exclude, model_config):
        """Prepare what should be excluded from the fillna strategy.

        The default behavior is to have all features for training excluded, in addition to 
        whatever is already given by `fillna_exclude`.

        Parameters
        ----------
        fillna_exclude : `list` of `str`, optional
            The names already configured to be excluded. Can be None. It is never modified.
        model_config : `dict`
            The model pipeline configuration, of which `features_for_training` is used.

        Returns
        -------
        `list`
            The names to be excluded, without duplicates and in no particular order.
        """

        # always build a new collection rather than extending the given list in place: the given 
        # list can be the one kept in the pipeline's configuration (e.g. `self.config['fillna_exclude']`) 
        # which must not grow every time this method is called
        excluded = set() if fillna_exclude is None else set(fillna_exclude)
        excluded.update(model_config['features_for_training'])
        return list(excluded)

    def prepare_dropna_exclude(self, dropna_exclude, model_config):
        """Prepare what should be excluded from the dropna strategy.

        The default behavior is to have all features for training excluded, in addition to 
        whatever is already given by `dropna_exclude`.

        Parameters
        ----------
        dropna_exclude : `list` of `str`, optional
            The names already configured to be excluded. Can be None. It is never modified.
        model_config : `dict`
            The model pipeline configuration, of which `features_for_training` is used.

        Returns
        -------
        `list`
            The names to be excluded, without duplicates and in no particular order.
        """

        # always build a new collection rather than extending the given list in place: the given 
        # list can be the one kept in the pipeline's configuration (e.g. `self.config['dropna_exclude']`) 
        # which must not grow every time this method is called
        excluded = set() if dropna_exclude is None else set(dropna_exclude)
        excluded.update(model_config['features_for_training'])
        return list(excluded)

    def prepare_model_config(self, model_config):
        """Hook for derived pipeline classes to prepare the model pipeline's configuration.

        The model pipeline configuration is what is given to the constructor as the `dict` 
        parameter `model_pipeline`. Derived classes override this method to validate the given 
        settings and/or to fill in default values for the missing ones. Validating here makes 
        an invalid configuration fail fast, at the time the asset group pipeline is created.

        The base implementation does nothing.

        Parameters
        ----------
        model_config : `dict`
            A dict containing the configuration for the model pipeline, from the content of 
            the constructor's parameter `model_pipeline`. To change the configuration, modify 
            this dict directly.
        """

        return None

    def _prepare_post_processing(self, summary, published_outputs, model_config):
        """Transform the summary input to be post processing AS aggregation function declarations.

        Parameters
        ----------
        summary : `dict`
            The summary specification, in the form described by `default_summary()`. When None, 
            the result of `default_summary()` is used.
        published_outputs : `dict`
            Output names mapped to Health side names. When not None, it is the only source of 
            published names (after name substitution); otherwise the default published outputs 
            are merged into the published names given within `summary`.
        model_config : `dict`
            The model pipeline configuration, used to resolve references like `${predictions[0]}`, 
            `${features[1]}` or `${features_for_training[0]}`.

        Returns
        -------
        `tuple`
            A 2-element tuple: the `list` of aggregation function declarations, and the `dict` of 
            output names mapped to Health side names.

        Raises
        ------
        ValueError
            If a reference index is out of range, a granularity or aggregation method is unknown, 
            a summary output name is generated twice, or a Health side name is used twice.
        """

        merged_post_processing = []
        summary_outputs = set()
        merged_published_outputs = {}

        if summary is None:
            summary = self.default_summary(model_config)

        reference = re.compile(r'\${(features|features_for_training|predictions)\[(\d+)\]}', re.I)

        def expand(template, grain_name, agg, data_item):
            # substitute the special values one after the other: granularity, method, data item
            template = re.sub(r'(\${(granularity)})', grain_name, template, flags=re.I)
            template = re.sub(r'(\${(method)})', agg, template, flags=re.I)
            return re.sub(r'(\${(data_item)})', data_item, template, flags=re.I)

        for output, output_summaries in summary.items():
            # NOTE(review): only the beginning of the key is matched, anything following the 
            # reference is dropped when the reference gets resolved - confirm this is intended
            m = reference.match(output)
            if m:
                try:
                    # the pattern matches case-insensitively while the configuration keys are 
                    # lower case, hence normalize the captured name before the lookup
                    output = model_config[m.group(1).lower()][int(m.group(2))]
                except IndexError as e:
                    raise ValueError('invalid summary_data_item=%s, index out of range' % output) from e
            # when there's no match, still allow any name user specified, it will be AS to validate 
            # if duplicate outputs. The fact is we can't validate before summary kpis are created on AS

            self.logger.debug('Processing output_summaries: %s', output_summaries)
            for grain_name, agg_methods in output_summaries.items():
                if grain_name not in GRANULARITY_NAMES and grain_name not in DEPRECECATED_GRANULARITY:
                    raise ValueError('invalid summary_granularity=%s for summary_data_item=%s, must be one of %s or %s' % (grain_name, output, list(GRANULARITY_NAMES.keys()), list(DEPRECECATED_GRANULARITY.keys())))

                for agg, summary_output in agg_methods.items():
                    if agg not in AGGREGATION_METHODS:
                        raise ValueError('invalid summary_method=%s for summary_data_item=%s and summary_granularity=%s, must be one of %s' % (agg, output, grain_name, list(AGGREGATION_METHODS.keys())))

                    health_name = None
                    if summary_output is None:
                        summary_output = '${granularity}_${data_item}_${method}'
                    elif isinstance(summary_output, (tuple, list)):
                        # a pair means (output name, name to be published to Health)
                        summary_output, health_name = summary_output

                    summary_output = expand(summary_output, grain_name, agg, output)

                    if summary_output in summary_outputs:
                        raise ValueError('duplicate summary_output=%s not allowed' % summary_output)

                    summary_outputs.add(summary_output)

                    if health_name is not None:
                        merged_published_outputs[summary_output] = expand(health_name, grain_name, agg, output)

                    # NOTE(review): a name accepted only through DEPRECECATED_GRANULARITY is still 
                    # looked up in GRANULARITY_NAMES here - confirm deprecated names are keys there too
                    merged_post_processing.append({
                        'functionName': AGGREGATION_METHODS[agg],
                        'enabled': True,
                        'granularity': GRANULARITY_NAMES[grain_name],
                        'output': {
                            'name': summary_output,
                        },
                        'input': {
                            'source': output,
                        }
                    })

        # if override by parameter, that's the only one to be used, otherwise, 
        # combine the default one and those specified with summary

        override = published_outputs is not None
        published_outputs = self._prepare_published_outputs(published_outputs, merged_post_processing, model_config)

        if override:
            merged_published_outputs = published_outputs
        else:
            merged_published_outputs.update(published_outputs)

        # we cannot easily validate Health side name uniqueness here, but at least we can 
        # find if there's any duplicates within this model pipeline
        renamed_outputs = list(merged_published_outputs.values())
        dup_renamed_outputs = find_list_duplicate(renamed_outputs)
        if len(dup_renamed_outputs) > 0:
            raise ValueError('duplicated Health side names=%s in merged_published_outputs=%s' % (dup_renamed_outputs, merged_published_outputs))

        return (merged_post_processing, merged_published_outputs)

    def default_summary(self, model_config):
        """Return the prediction output summary to be generated by default.

        Derived model pipeline classes override this method to declare their default prediction 
        summary. The base implementation declares none.

        An example, taken from `pmlib.anomaly_detection.AnomalyDetectionAssetGroupPipeline.default_summary`:

        ```
        return {
            '${predictions[0]}': {
                'daily': {
                    'max': ('${granularity}_${data_item}', '${data_item}'),
                }
            }
        }
        ```

        It generates one summary output for the first prediction output: a daily summary using the 
        aggregation method maximum, named like `daily_anomaly_score` when `anomaly_score` is the 
        original prediction output.

        The name value is given as a tuple of two elements here. In that case, the first element is 
        the summary output name and the second element is the name to be published and used in Health.

        Another example, from `pmlib.failure_prediction.FailurePredictionAssetGroupPipeline.default_summary`:

        ```
        return {
            '${predictions[0]}': {
                'daily': {
                    'max': ('${granularity}_${data_item}', '${data_item}'),
                },
                'group_daily': {
                    'mean': '${granularity}_${data_item}',
                },
            }
        }
        ```

        It generates two summary outputs: the daily maximum failure probability and the daily 
        average failure probability of the whole asset group. Only the `daily` one is published; 
        the `group_daily` one is given as a plain string instead of a tuple, hence is not.

        The summary specification in detail:

        * The returned `dict` is keyed by the data item names to generate summary for. Any data 
          item available within this asset group can be used, either raw metrics or derived data 
          items. Most of the time though, a special syntax is used to reference the pipeline's 
          configured input features or output prediction names, like `${predictions[0]}` or 
          `${features[1]}` or `${features_for_training[0]}`.
        * Each first level value is another `dict` keyed by granularity, which can be any of 
          `hourly`, `daily`, `weekly`, `monthly`, `group_hourly`, `group_daily`, `group_weekly`, 
          and `group_monthly`.
        * Each second level value (for some granularity) is yet another `dict`, keyed by summary 
          method with the summary output name to use as value. The summary method can be any of 
          `count`, `first`, `last`, `max`, `mean`, `median`, `min`, `product`, `std`, `sum`, 
          and `var`.
        * The summary output name can be any literal, as long as it does not conflict with other 
          metrics or data items within this asset group. The special values `${granularity}`, 
          `${method}`, and `${data_item}` can be used in it. When `None` is given as the summary 
          output name, `${granularity}_${data_item}_${method}` is used.

        Parameters
        ----------
        model_config : `dict`
            A dict containing the content of the constructor's parameter `model_pipeline`.

        Returns
        -------
        `dict` of summary data to be generated.
        """
        return dict()

    def _prepare_published_outputs(self, published_outputs, post_processing, model_config):
        """Transform the given published_outputs with name substitution.
        """

        prepared_published_outputs = dict()

        if published_outputs is None:
            published_outputs = self.default_published_outputs(model_config)

        self.logger.debug('Preparing published outputs. Input published_outputs=%s', published_outputs)

        p = r'\${(features|features_for_training|predictions)\[(\d+)\]}'
        repl = lambda m: model_config[m.group(1)][int(m.group(2))]
        for name, published_name in published_outputs.items():
            name = re.sub(p, repl, name, flags=re.I)
            published_name = re.sub(p, repl, published_name, flags=re.I)

            prepared_published_outputs[name] = published_name


        invalid_outputs = set()
        all_outputs = model_config['predictions'] + [kpi['output']['name'] for kpi in post_processing]
        for name in prepared_published_outputs:
            if name not in all_outputs:
                invalid_outputs.add(name)
        if len(invalid_outputs) > 0:
            raise ValueError('invalid_outputs=%s in prepared_published_outputs=%s' % (invalid_outputs, prepared_published_outputs))

        self.logger.debug('Finished preparing published outputs: %s', prepared_published_outputs)
        return prepared_published_outputs

    def default_published_outputs(self, model_config):
        """Return the prediction outputs to be published to Health side by default.

        Derived model pipeline classes override this method to declare which of their prediction 
        outputs are published to Health side by default. The base implementation declares none.

        For summary outputs, publishing is usually declared in method 
        `pmlib.pipeline.AssetGroupPipeline.default_summary`, which is also easier. This method is 
        mostly used for marking non-summary level outputs to be published.

        An example:

        ```
        return {
            'daily_${predictions[0]}': '${predictions[0]}',
        }
        ```

        It publishes the daily summary output of the first prediction output to Health, using the 
        original prediction output name. Essentially, the daily summary is treated as the 
        predicted value for the asset in Health.

        The returned `dict` is keyed by output names, which can be any raw or summary output of 
        this model pipeline, with the Health side names as values. A special syntax can be used to 
        reference the pipeline's configured input features or output names, like 
        `${predictions[0]}` or `${features[1]}` or `${features_for_training[0]}`.

        Each key must be a valid output name of this model pipeline, otherwise a `ValueError` is 
        raised when the pipeline object is created. The values must not contain duplicates, 
        otherwise a `ValueError` is raised as well.

        Parameters
        ----------
        model_config : `dict`
            A dict containing the content of the constructor's parameter `model_pipeline`.

        Returns
        -------
        `dict` of data items as keys and Health side names as values to be published.
        """

        return dict()

    def prepare_execute(self, pipeline, model_config):
        """Prepare the pipeline based on the given model pipeline configuration.

        This method is the main place for derived pipeline classes to override to setup their 
        pipeline stages. The given `pipeline` object provides `iotfunctions.pipeline.CalcPipeline.add_stage` 
        for adding pipeline stages. Each stage is essentially a class with `execute(df, start_ts, end_ts)` 
        method which takes an input upstream dataframe and returns a result dataframe for downstream.

        The given `model_config` is a dict containing the constructor's parameter `model_pipeline`.

        In most cases, you would add one or more transformers for preprocessing and/or feature extraction, 
        then followed by an estimator (derived classe of `pmlib.estimator.SromEstimator` or `pmlib.estimator.BaseEstimator`).

        Here's the simplest example of adding just an estimaotr (taken from `pmlib.degradation_curve.DegradationCurveAssetGroupPipeline`):

        ```
        pipeline.add_stage(DegradationCurveEstimator(**model_config))
        ```

        Here's another more complex example (taken from `pmlib.anomaly_detection.AnomalyDetectionAssetGroupPipeline`):

        ```
        estimator = AnomalyDetectionEstimator(**model_config)
        pipeline.add_stage(estimator)
        if len(model_config['features_for_training']) > 0:
            estimator.add_training_preprocessor(TransformNotNaToEvent(model_config['features_for_training'][0]))
            estimator.add_training_preprocessor(IdentifyPreFailureWindow(model_config['features_for_training'][0], pre_failure_window_size=model_config.get('pre_failure_window_size', 20), pre_failure_failure_size=model_config.get('pre_failure_failure_size', 10)))
        ```

        In this example, we have added two preprocessing for "training" only.
        
        By default this method adds Missing Value analysis and AutoImputation transformer as a common capability. As shown below this is driven by configuration settings. 
        If this is the desired behaviour, then make sure to call super().prepare_execute() from the derived class implementation. If not and if a different behaviour is 
        sought then override, but avoid calling super().prepare_execute().

        Parameters
        ----------
        pipeline : `iotfunctions.pipeline.CalcPipeline`
            A pipeline object for adding the stages to.
        model_config : `dict`
            A dict containing the content of contructor's parameter `model_pipeline`.
        """

        if self.training:
            self.logger.info('Appears to be training task, so checking for MissingValueAnalysis and AutoImputation config')

            if model_config.get('missing_value_analysis', True):
                self.logger.info('Found MissingValueImputation- adding it to the pipeline')
                missing_value_thresholds = model_config.get('missing_value_thresholds', {})
                self.logger.debug('missing_value_thresholds= %s', missing_value_thresholds)
                stop_if_missing_values_exceed_threshold = model_config.get('stop_if_missing_values_exceed_threshold',False)
                self.logger.debug('Stop the pipeline if missing values exceed the threshold: %s', stop_if_missing_values_exceed_threshold)
                auto_imputation_config = model_config.get('auto_imputation_config',None)
                mva_ai_args = {}
                mva_ai_args['auto_imputation_config'] = auto_imputation_config
                self.logger.debug('auto imputation configuration = %s', auto_imputation_config)
                pipeline.add_stage(MissingValueImputationTransformer(
                                    included_cols=[],
                                    excluded_cols=['faildate', 'installdate', 'failurecode', 'problemcode','causecode','remedycode'],
                                    missing_thresholds=missing_value_thresholds,
                                    stop_if_missing_values_exceed_threshold = stop_if_missing_values_exceed_threshold,
                                    **mva_ai_args)
                                  )
            else:
                self.logger.info("Skipping missing value analysis and auto imputation as they are not configured. If this is needed, enable 'missing_value_analysis' parameter and restart the training")


    def get_prediction_backtrack(self, model_config, incremental=True, n=1):
        """This method returns the backtrack window to look backward while loading data.

        When loading data to generate prediction results and prediction summary outputs, if there's a 
        need of history data, in addition to the latest data for the current batch, we must know how 
        far back in the history to fetch the data. This method provides the hook for make that 
        decidsion.

        The default implementation is to deduce the backtrack window by the summary outputs 
        required. Let's say if a daily summary output is to be generated, the default implementation 
        goes back to the start of the previous calendar day to fetch data from there. If both daily 
        and montly summary outputs are required, then it goes back to the start of the previous month 
        to fetch data. This is for the starting point of the time range to fetch data.

        As for the ending point of the time range to fetch data, it is controlled by parameter 
        `incremental`. If it is True, then the end point is up to current time, i.e. the time 
        current scoring happens. If it is False, then the end point is the ending time of the 
        summary unit. For example, for daily, it is today's midnight, and for monthly, it's the 
        last day of previous month.

        This method returns a two-element `list` (of `list` of `pandas.tseries.offsets.DateOffset`) of 
        whiech the first is for the starting point of the time range to fetch data while the second is 
        for the ending point.

        The way the returned values used is that they are substracted from the current time (scoring 
        time), one by one, to construct the starting/endding time to load data.

        Here's an example of always loading the complete 24H data of the previous calendar-day:

        ```
        reset = pandas.tseries.offsets.DateOffset(**{"hour": 0, "minute": 0, "second": 0, "microsecond": 0})
        offset = pandas.tseries.frequencies.to_offset('1d')
        return [[reset, offset], []]
        ```

        Here, the starting part (first element) contains 2 elements, the first one is used to reset 
        currrent time to midnight, then the next is used to move to previous day. The result is to use 
        yesterday's midnight as the starting point to load data. The ending part is an empty list, 
        which results in using current time as the ending point to load data.


        Parameters
        ----------
        model_config : `dict`
            A dict containing the content of contructor's parameter `model_pipeline`.
        incremental : `bool`, optional
            Whether to generate incremental summary outpus before the time winodow is fully passed. Default is True.
        n : `int`, optional
            The number of time periods the offset represents. Default is 1.

        Returns
        -------
        `list` of `list` of `pandas.tseries.offsets.DateOffset`
            A 2-element list, of which the first element is for the starting time (inclusive) 
            while the second element is for the ending time (inclusive). Each element is a list of 
            `pandas.tseries.offsets.DateOffset`, of which can contain multiple DateOffset.
        """

        if self.post_processing is None or len(self.post_processing) == 0:
            return None

        start = []
        end = []

        grain_names = {p['granularity'] for p in self.post_processing}
        if 'Monthly' in grain_names or 'GroupMonthly' in grain_names:
            reset = DateOffset(**{"day": 1, "hour": 0, "minute": 0, "second": 0, "microsecond": 0})
            start.append(reset)
            start.append(DateOffset(n=n, **{"months": 1}))
            if not incremental:
                end.append(reset)
        elif 'Weekly' in grain_names or 'GroupWeekly' in grain_names:
            today = pd.Timestamp('today').weekday()
            reset = DateOffset(**{"weekday": 6, "hour": 0, "minute": 0, "second": 0, "microsecond": 0})
            start.append(reset)
            if today == 6: # starting at Sunday
                # offset resetting go forward for weekday, but for same weekday as indicated, it stasy at today, 
                # hence just need to go back 1 week
                start.append(DateOffset(n=n, **{"weeks": 1}))
            else:
                # offset resetting go forward for weekday, hence need to go back 2 weeks
                start.append(DateOffset(n=n+1, **{"weeks": 1}))
            if not incremental:
                end.append(reset)
        elif 'Daily' in grain_names or 'GroupDaily' in grain_names:
            reset = DateOffset(**{"hour": 0, "minute": 0, "second": 0, "microsecond": 0})
            start.append(reset)
            start.append(DateOffset(n=n, **{"days": 1}))
            if not incremental:
                end.append(reset)
        elif 'Hourly' in grain_names or 'GroupHourly' in grain_names:
            reset = DateOffset(**{"minute": 0, "second": 0, "microsecond": 0})
            start.append(reset)
            start.append(DateOffset(n=n, **{"hours": 1}))
            if not incremental:
                end.append(reset)
        elif 'Minute' in grain_names or 'GroupMinute' in grain_names:
            reset = DateOffset(**{"second": 0, "microsecond": 0})
            start.append(reset)
            start.append(DateOffset(n=n, **{"minutes": 1}))
            if not incremental:
                end.append(reset)
        else:
            self.logger.warning('unknown granularities=%s, no backtrack processing', self.post_processing)
            return None

        return [start, end]

    def execute(self, df=None, start_ts=None, end_ts=None, entities=None):
        """Execute this pipeline.

        Once a pipeline is created and configured properly, this method runs the pipeline. This method knows 
        whether the model is trained or not, and trains it first if not yet before scores.

        Parameters
        ----------
        df : `DataFrame`, optional
            The upstream output dataframe for the pipeline. The pipeline always loads data from the data lake, 
            no matter this parameter is given or not. If given, the output from this pipeline is concatenated to 
            the upstream output first before returned. Default is None.
        start_ts : str, optional
            The start of the range of the history to be retrieved, inclusive. Default is None, meaning all the way 
            back the earliest record. It is given in the format like '2019-01-31 06:51:34.234561', with time portion 
            optional.
        end_ts : str, optional
            The end of the range of the history to be retrieved, exclusive. Default is None, meaning all the way 
            to the latest record. It is given in the format like '2019-01-31 06:51:34.234561', with time portion 
            optional.
        entities : `list` of `str`, optional
            A list of assets to filter. Only these assets are processed by the pipeline. Default is 
            no filtering. Note that the asset are given by the format `<asset_id>-____-<site_id>`.

        Returns
        -------
        `DataFrame`
            A dataframe containing the generated prediction results.
        """

        self.logger.info('Starting execution of %s Asset Group Pipeline...', self.model_template_name if self.model_template_name is not None else 'Unspecified')
        self.logger.debug('Received input DataFrame: df=%s', log_df_info(df, 5, logger=self.logger, log_level=logging.DEBUG))

        self.df_traces = {self.name: {}}
        self.df_traces[self.name]['input'] = df.copy() if df is not None else None

        df_upstream = df

        pipeline = self._entity_type.get_calc_pipeline()

        # add asset cache refresher (preloader)
        # TODO for unknown reason, adding this preload function does not work on AS side (but works locally...)
        # pipeline.add_stage(AssetCacheRefresher(self.asset_group_id, data_items=list(self.pipeline_config['inputs'])))

        self.pipeline_config = self._get_pipeline_config()


        if self.pipeline_config.get('use_cm', False):
            use_cm= self.pipeline_config.get('use_cm', False)
            cm_code= self.pipeline_config.get('cm_code', 'CM')
            self.logger.debug('Using CM, cm_code=%s', cm_code)
                
        if  self.pipeline_config.get('use_aggregated_data_in_monitor', False):
            self.logger.debug('Initializing Asset Loader, using aggregated data in monitor.')
            loader = AggregatedAssetLoader(
                asset_group=self.asset_group_id,
                _entity_type=self._entity_type,
                data_items=list(self.pipeline_config['inputs']),
                names=list(self.pipeline_config['renamed_inputs']),
                resamples=self.pipeline_config['features_resampled'],
                entity_type_metadata=self.entity_type_metadata,
                asset_device_mappings=self.config.get('asset_device_mappings', None), 
                fillna=self.config['fillna'],
                fillna_exclude=self.prepare_fillna_exclude(self.config['fillna_exclude'], self.pipeline_config),
                dropna=self.config['dropna'],
                dropna_exclude=self.prepare_dropna_exclude(self.config['dropna_exclude'], self.pipeline_config),
                use_cm = self.pipeline_config.get('use_cm', False),
                cm_code = self.pipeline_config.get('cm_code', 'CM'))


        else:
            self.logger.debug('Initializing Asset Loader.')
            loader = AssetLoader(
                asset_group=self.asset_group_id,
                _entity_type=self._entity_type,
                data_items=list(self.pipeline_config['inputs']),
                names=list(self.pipeline_config['renamed_inputs']),
                resamples=self.pipeline_config['features_resampled'],
                entity_type_metadata=self.entity_type_metadata,
                asset_device_mappings=self.config.get('asset_device_mappings', None), 
                fillna=self.config['fillna'],
                fillna_exclude=self.prepare_fillna_exclude(self.config['fillna_exclude'], self.pipeline_config),
                dropna=self.config['dropna'],
                dropna_exclude=self.prepare_dropna_exclude(self.config['dropna_exclude'], self.pipeline_config),
                use_cm = self.pipeline_config.get('use_cm', False),
                cm_code = self.pipeline_config.get('cm_code', 'CM'))

        if self.data_substitution is not None:
            for entity_type in self.data_substitution:
                loader.set_data_substitution(entity_type, self.data_substitution[entity_type])

        pipeline.add_stage(loader)

        if self.training:
            model_config = self._get_pipeline_config()

        # pass the pipeline to the sub-class for constructing the pipeline stages
        # safe-guard the user given model pipeline config by copying before passing to `prepare_execute()`
        model_config = dict(self.pipeline_config)
        for key, value in model_config.items():
            if isinstance(value, list) or isinstance(value, dict):
                model_config[key] = value.copy()

        #Need to pass self.pre_trained_model to the Estimator
        model_config['pre_trained_model'] = self.pre_trained_model

        self.prepare_execute(pipeline, model_config)

        # pass down the model version 'timestamp' to all estimators in the pipeline
        for stage in pipeline.stages:
            if isinstance(stage, BaseEstimator):
                stage.local_model = self.local_model
                if self.training or self.model_timestamp is None or stage.__class__.__name__ not in self.model_timestamp:
                    stage.model_timestamp = None
                else:
                    stage.model_timestamp = self.model_timestamp[stage.__class__.__name__]

        # the following must come after prepare_execute() since get_prediction_backtrack() could need 
        # self.pipeline_config which could be changed by prepare_execute()
        start_ts = pd.Timestamp(start_ts) if isinstance(start_ts, str) else start_ts
        end_ts = pd.Timestamp(end_ts) if isinstance(end_ts, str) else end_ts
        
        if not self.training and start_ts is not None and end_ts is None:
            # when scoring on AS runtime, if start_ts is given without end_ts, it must be run
            # with scheduling, hence we need to determine our backtrack (from Predict side)
            model_config = dict(self.pipeline_config)
            for key, value in model_config.items():
                if isinstance(value, list) or isinstance(value, dict):
                    model_config[key] = value.copy()
            
            backtrack = self.get_prediction_backtrack(model_config, incremental=self.incremental_summary)
            if backtrack is not None and (isinstance(backtrack, list) or isinstance(backtrack, tuple)) and len(backtrack) == 2:
                self.logger.debug('Computed backtrack: start_ts=%s, end_ts=%s, to adjust scheduled prediction based on backtrack=%s', start_ts, end_ts, backtrack)
                try:
                    launch_time = pd.Timestamp('today')
                    if backtrack[0] is not None and isinstance(backtrack[0], list):
                        # Kewei add the if statement because the backtack =10 days in Monitor, start_ts=2021-04-12 , now= 2021-04-22, PMLIB ignores the start_ts passed by Monitor.
                        # we only set the start_ts to launch_time when start_ts is None. If Monitor passes in start_ts, we will use it.
                        if start_ts is None:
                            start_ts = launch_time
                        self.logger.debug('start_ts=%s, end_ts=%s, to adjust scheduled prediction before the for loop ' % (start_ts, end_ts))
                        for offset in backtrack[0]:
                            start_ts -= offset
                    end_ts = launch_time
                    if backtrack[1] is not None and isinstance(backtrack[1], list):
                        for offset in backtrack[1]:
                            end_ts -= offset
                    self.logger.debug('start_ts=%s, end_ts=%s, with backtrack adjusted', start_ts, end_ts)
                except:
                    raise RuntimeError('invalid backtrack=%s' % str(backtrack))
        elif 'features_resampled' in self.pipeline_config and len(self.pipeline_config['features_resampled']) > 0:
            # TODO iotfunctions currently has a defect in db.read_agg when either only start_ts or only end_ts is given
            if start_ts is not None and end_ts is None:
                # if just start_ts is given, use current time as end_ts
                end_ts = pd.Timestamp('today')
            elif start_ts is None and end_ts is not None:
                # reject if only end_ts is given
                raise ValueError('currently when resampling is used, it is not supported to give just end_ts without also giving start_ts')

        # iotfunctions treats end_ts as inclusive when getting sensor data from table, so subtract 1 microsecond
        if end_ts is not None:
            end_ts -= pd.Timedelta(microseconds=1)
            self.logger.debug(
                'iotfunctions treats end_ts as inclusive when reading sensor data, so subtracting 1 microsecond to account for this: end_ts=%s', 
                end_ts)

        if end_ts is not None and pd.Timestamp(end_ts) >= pd.Timestamp.max:
            end_ts = None

        # run asset cache refresher (preloader) first
        if not api.is_local_mode():
            asset_refresher = AssetCacheRefresher(self.asset_group_id, data_items=list(self.pipeline_config['inputs']), db=self.db, db_schema=self.db_schema)
            asset_refresher.execute(start_ts=start_ts, end_ts=end_ts, entities=entities)

        # now execute the pipeline
        df_predicted = pipeline.execute(to_csv=self.debug, start_ts=start_ts, end_ts=end_ts, entities=entities)

        # once executed, check if any newly trained model
        model_timestamp = dict()
        trained_models = []
        df_traces = dict()
        training_timestamp = None
        for stage in pipeline.stages:
            # get the last training timestamp
            if isinstance(stage, BaseEstimator) and stage.training_timestamp is not None:
                training_timestamp = stage.training_timestamp
                model_timestamp[stage.__class__.__name__] = stage.training_timestamp

                for model_name, model in stage.models.items():
                    stage_model_paths = [model_name]
                    extras = stage.model_extras[model_name]
                    for extra_path, extra_obj, picket_dump, binary in extras:
                        stage_model_paths.append(extra_path)
                    trained_models.append((stage, model, stage_model_paths))

            if getattr(stage, 'df_traces', None) is not None:
                df_traces[stage.name] = stage.df_traces

        self.trained_models = trained_models
        self.df_traces.update(df_traces)
        if self.training and self.training_timestamp != training_timestamp:
            self.training_timestamp = training_timestamp
            self.new_training = True
            self.model_timestamp = model_timestamp

            # clear up training state so this object can be used for prediction
            # also since we don't want to load training features for prediction, refresh the inputs 
            # of model pipeline config here

            self.training = False
            self.pipeline_config = self._get_pipeline_config()

        self.logger.debug('new_training=%s, training_timestamp=%s, model_timestampe=%s', self.new_training, self.training_timestamp, self.model_timestamp)

        if df_predicted is None or df_predicted.empty:
            if self.training and self.new_training is False:
                raise RuntimeError('training failed, probably there is not enough data')

            # no prediction generated from this pipeline, 
            # if no upstream, create an empty dataframe with proper index and prediction output columns
            # if with upstream, simply append prediction output columns with value None
            if df_upstream is None:
                df = pd.DataFrame(columns=[self._entity_type._df_index_entity_id, self._entity_type._timestamp] + self.pipeline_config['predictions'])
                df = df.set_index([self._entity_type._df_index_entity_id, self._entity_type._timestamp])
            else:
                df = df_upstream
                for col in self.pipeline_config['predictions']:
                    df[col] = None
        else:
            df = df_predicted[self.pipeline_config['predictions']]
            df = df.dropna(how='all')

            if df_upstream is not None and not df_upstream.empty:
                # concat() cannot handle a duplicated multi-index, hence when there is an upstream and we
                # need to concat them, we must remove duplicate rows (on index) first. This is a rare
                # situation since in normal cases our asset group pipeline does not expect an upstream,
                # except when there are multiple asset group pipelines in an entity type, in which case we
                # simply remove duplicate rows (actually only duplicated timestamps), which is fine for our purpose.

                self.logger.debug('df_upstream=%s', log_df_info(df_upstream, head=0, logger=self.logger, log_level=logging.DEBUG))
                df_upstream = df_upstream.loc[~df_upstream.index.duplicated(keep='last')]
                self.logger.debug('df_upstream_dedup=%s', log_df_info(df_upstream, head=0, logger=self.logger, log_level=logging.DEBUG))

                self.logger.debug('df=%s', log_df_info(df, head=0, logger=self.logger, log_level=logging.DEBUG))
                df = df.loc[~df.index.duplicated(keep='last')]
                self.logger.debug('df_dedup=%s', log_df_info(df, head=0, logger=self.logger, log_level=logging.DEBUG))

            # still want to concat DFs, even if incoming is empty
            df = pd.concat([df_upstream, df], axis=1)
            

        self.df_traces[self.name]['output'] = df

        
        if self.new_training:
            self.new_training_output_df = df.copy()

        # close connection for DBModelStore at the end of execute
        # TO DO Need to talk to Monitor to get flag to use COS or DB2/postgresql
        #if hasattr(self.db, 'model_store'):
        #    ibm_db.close(self.db.model_store.db_connection)

        self.logger.debug('Pipeline final DF=%s', log_df_info(df, head=5, include_missing_value_count=True, logger=self.logger, log_level=logging.DEBUG))
        self.logger.info('Finished execution of %s Asset Group Pipeline.', self.model_template_name if self.model_template_name is not None else 'Unspecified')
        return df

    def predict(self, df=None, start_ts=None, end_ts=None, entities=None):
        """Score data with the already-trained pipeline and return the predictions.

        This is a thin wrapper over `execute()` intended for manual scoring. It refuses to run while 
        the pipeline is still in training mode, and when only `start_ts` is supplied it fills in an 
        explicit far-future `end_ts` so that `execute()` does not enter its backtrack determination, 
        which is meant for scheduled runs on AS runtime only.

        Parameters
        ----------
        df : `DataFrame`, optional
            The upstream output dataframe, forwarded unchanged to `execute()`. When given, the output 
            of this pipeline is concatenated to it before being returned. Default is None.
        start_ts : str, optional
            Inclusive lower bound of the data range to retrieve, e.g. '2019-01-31 06:51:34.234561' 
            (the time portion is optional). Default is None, meaning no lower bound.
        end_ts : str, optional
            Exclusive upper bound of the data range to retrieve, in the same format as `start_ts`. 
            Default is None, meaning no upper bound.
        entities : `list` of `str`, optional
            Assets to restrict the processing to, each given as `<asset_id>-____-<site_id>`. Default 
            is None, meaning no filtering.

        Returns
        -------
        `DataFrame`
            Whatever `execute()` returns for the given arguments, i.e. the dataframe holding the 
            generated prediction results.

        Raises
        ------
        `RuntimeError`
            If the pipeline is still in training mode (`self.training` is truthy).
        """

        if self.training:
            raise RuntimeError('complete the model training before calling this method to predict')

        # a start without an end is an open-ended manual request: hand `execute()` the largest 
        # representable timestamp instead of None so that it sees an explicit end of range
        open_ended = start_ts is not None and end_ts is None
        effective_end_ts = pd.Timestamp.max if open_ended else end_ts

        return self.execute(df=df, start_ts=start_ts, end_ts=effective_end_ts, entities=entities)

        
    def register(self, url=None, template_only=False, model_instance_name=None, model_instance_desc=None, write_initial_result=True,upgrade=False, **kwargs):
        """Register the trained model.

        Once a model is successfully trained by running the `pmlib.pipeline.AssetGroupPipeline.execute`, this 
        method must be called in order to register the newly trained model instance to PMI so that it can be 
        used to generate prediction results for future incoming data.

        Note that the model remains inactive right after being registered. Reliability Engineer is expected to 
        review it to activate it from UI or by REST API.

        Parameters
        ----------
        url : str, optional
            Default is None. This parameter is only necessary for registering custom models. If 
            using out-of-the-box models, this parameter must not be given. For custom models, the 
            given url points to a location from where `pip install` can retrieve the package. It 
            can be a zip/tar/tgz source archive file, or it can be a source control project on 
            GitHub. Remember, it must point to a location accessible to PMI (not behind any 
            firewall).

            Note that if this parameter is not given for custom models, by default the system uses 
            `pmlib.pipeline.SimpleCustomAssetGroupPipelineLoader` as the loader and proxy for the 
            custom models.
        template_only : bool, optional
            Whether register only model template but not the trained model instance. Default is False.
        model_instance_name : str, optional
            The model instance's name to be registered for the model. Default is None, which uses 
            `<asset_group_id>_<self.__class__.__name__>_<datetime.utcnow().isoformat()>`.
        model_instance_desc : str, optional
            The model instance's description to be registered for the model. Default is None, which 
            simply be the same 'model_instance_name'.
        write_initial_result : bool, optional
            Set to True to write the initial result from the training to the data lake. If you are in a SaaS
            environment, leaving this value at True will cause a large spike in Predict's usage, depending 
            on how much sensor data exists for your assets. Because of this, we recommend setting this to 
            False when in a SaaS environment. Even if this value is False, the predictions will still 
            be written to the predictions table when scoring runs in Monitor. Default is True.
        upgrade: bool, optional
            Set to True to save the model instance to the KPI_MODEL_STORE table only. It will keep all the Prediction results.
            Default is False.
        """

        self.logger.info('Beginning registration of model to Maximo Predict...')

        # reject if no trained model available yet
        # if loading back a trained model instance, you can still register it again (to update something)
        if template_only != True and not self.new_training:
            if self.model_timestamp is None:
                raise RuntimeError('complete the model training before registering to the system')
            elif self.loaded_model_instance is None:
                raise RuntimeError('load trained model by its model instance id before updating its registration')

        catalog_config_name = self.__class__.__name__
        catalog_config_target = '%s.%s' % (self.__class__.__module__, self.__class__.__name__)
        catalog_config_input_addition = []
        kpi_config_input_addition = {}

        self.logger.debug('target_pipeline_class=%s', catalog_config_target)

        model_template_id, model_template_name, model_template_desc = catalog_config_name, self.model_template_name, self.model_template_desc
        
        #Kewei code change for Digital Twin POC
        try:
            model_template_id= kwargs['model_template_id']
        except:
            pass
        
        self.logger.debug('Model Template ID: %s', model_template_id)
        
        if url is None or re.compile(r'(git[+])?https?://(\w+)@([^@]+)@(\w+)', re.I).match(url):
            if url is None:
                url = '%s/ibm/pmi/service/rest/ds/%s/%s/lib/download?filename=%s' % (
                    self.apm_api_baseurl, 
                    self.apm_id, 
                    self.apm_api_key, 
                    __package__
                    )
            self.logger.debug('This class name: %s', self.__class__.__name__)
            if all([self.__class__.__name__ != cls for cls in [
                    'AnomalyDetectionAssetGroupPipeline',
                    'DegradationCurveAssetGroupPipeline',
                    'FailurePredictionAssetGroupPipeline',
                    'TimeToFailureAssetGroupPipeline',
                    'WmlScoringAssetGroupPipeline']]):

                # TODO dill==3.0.0 cannot deal with classes with __init__ explicitly defined well, for now reject if any
                has_init = False
                if '__init__' in self.__class__.__dict__:
                    self.logger.debug('Found the current class %s has the __init__ method defined, which is not allowed', self.__class__.__name__)
                    has_init = True
                else:
                    pipeline = self._entity_type.get_calc_pipeline()
                    model_config = dict(self.pipeline_config)
                    for key, value in model_config.items():
                        if isinstance(value, list) or isinstance(value, dict):
                            model_config[key] = value.copy()
                    self.prepare_execute(pipeline, model_config)
                    has_init = any(['__init__' in stage.__class__.__dict__ for stage in pipeline.stages])
                    if not has_init:
                        bases = []
                        for stage in pipeline.stages:
                            bases.extend(stage.__class__.__bases__)
                        for cls in bases:
                            if cls.__module__ == self.__class__.__module__:
                                if '__init__' in cls.__dict__:
                                    has_init = True
                                    break
                                bases.extend(cls.__bases__)
                if has_init:
                    raise RuntimeError('Classes with __init__ explicitly defined are not supported to directly register, you have to create your own Python module/project to register it.')

                catalog_config_name = model_template_id
                catalog_config_target = '%s.%s' % (SimpleCustomAssetGroupPipelineLoader.__module__, SimpleCustomAssetGroupPipelineLoader.__name__)
    
                # change model instance name and description to include target pipeline class name
                if model_instance_name is None:
                    model_instance_name = '%s_%s_%s' % (self.asset_group_id, self.__class__.__name__, datetime.utcnow().isoformat())
                if model_instance_desc is None:
                    model_instance_desc = model_instance_name

                # dump and save custom pipeline class to COS
                cos_path = '/'.join(['apm', 'pmi', 'model_pipeline', self.asset_group_id, self.__class__.__name__, 'pipeline'])
                self.logger.debug('Saving pipeline class to cos_path=%s', cos_path)

                if api.is_local_mode():
                    try:
                        _mkdirp(cos_path)
                    except:
                        pass
                    with open(cos_path, mode='wb') as file:
                        file.write(dill.dumps(self.__class__, recurse=True))

                    # this method is really not meant for local mode; the only reason to call it in local
                    # mode is to write out the custom model object to the local FS for testing purposes, so
                    # that SimpleCustomAssetGroupPipelineLoader can load it back
                    return None
                else:
                    # work-around to be able to dill with recurse

                    cos_kpi = os.environ.get('COS_BUCKET_KPI')
                    if cos_kpi is not None:

                        bucket = None
                        try:
                            bucket = self._entity_type.db.credentials['config']['bos_runtime_bucket']
                        except:
                            raise RuntimeError('unable to find AS COS runtime bucket')
                        ret = self._entity_type.db.cos_client._cos_api_request('PUT', bucket=bucket, key=cos_path, payload=dill.dumps(self.__class__, recurse=True), binary=True)
                        if ret is None:
                            raise RuntimeError('failed saving custom pipeline class to COS')
                    else:
                        try:
                            if self._entity_type.db.model_store.entity_type_id is None:
                                #_entity_type_id
                                #self._entity_type.db.model_store.entity_type_id = get_entity_type_id(self._entity_type.logical_name)
                                #self._entity_type.db.model_store.entity_type_id = self._entity_type._entity_type_id
                                self._entity_type.db.model_store.entity_type_id = get_entity_type_id(self._entity_type.logical_name)
                            self.logger.debug('Begin of Saving to KPI Model Store with db.model_store.entity_type_id=%s', self._entity_type.db.model_store.entity_type_id)
                            self.logger.debug('Begin of Saving to KPI Model Store model=dill.dumps(self.__class__, recurse=True),user_name=None,serialize=False')
                            self._entity_type.db.model_store.store_model(model_name=cos_path,model=dill.dumps(self.__class__, recurse=True),user_name=None,serialize=False)
                            self.logger.debug('End of Saving to KPI Model Store with db.model_store.entity_type_id=%s', self._entity_type.db.model_store.entity_type_id)
                        except:
                            raise RuntimeError('unable to save custom model to KPI_MODEL_STORE.')
                        
                # TODO how can we reject complex custom pipeline which uses external modules not available on AS?
        else:
            if self.__class__.__module__ == '__main__':
                raise RuntimeError('classes of local module "__main__" cannot be registered')

            # TODO do a verification here to fail fast

        # register to catalog first, then create a KPI based on the instance


        #If it is upgrade , then just save the new model trained with new python run time , for example, trained with python3.10
        if  upgrade :
            self.logger.debug('UPGRADE CASE, will just update the model instance')

            for estimator, model, model_paths in self.trained_models:
                estimator._save_model(bucket=estimator.get_bucket_name(), new_model=model, suffix=estimator.training_timestamp, local=False)
                #estimator._save_model_df_trace(bucket=estimator.get_bucket_name(), suffix=estimator.training_timestamp, local=False, training=True)
                #uploaded_models.setdefault(estimator.name, list()).extend(model_paths)
            #self.logger.debug('Uploading trained models: uploaded_models=%s', uploaded_models)
            return None

        

        catalog_config = {
            'name': catalog_config_name,
            'description': catalog_config_name,
            'moduleAndTargetName': catalog_config_target,
            'url': url,
            'category': 'TRANSFORMER',
            'tags': [],
            'output': [
                    {
                        'name': 'names',
                        'description': 'Provide a list of output names to be generated from the pipeline.',
                        'dataType': 'ARRAY',
                        'jsonSchema': {
                            'minItems': 1,
                            '$schema': 'http://json-schema.org/draft-07/schema#',
                            'type': 'array',
                            'items': {
                                'type': 'string'
                            }
                        },
                        'tags': []
                    }
            ],
            'input': [
                {
                    'name': 'asset_group_id',
                    'type': 'CONSTANT',
                    'required': True,
                    'dataType': 'LITERAL',
                },
                {
                    'name': 'model_pipeline',
                    'type': 'CONSTANT',
                    'required': True,
                    'dataType': 'JSON',
                },
                {
                    'name': 'fillna',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'LITERAL',
                    'values': ['backfill', 'bfill', 'ffill', 'pad'],
                },
                {
                    'name': 'fillna_exclude',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'ARRAY',
                    'dataTypeForArray': ['LITERAL'],
                    'jsonSchema': {
                        'minItems': 1,
                        '$schema': 'http://json-schema.org/draft-07/schema#',
                        'type': 'array',
                        'items': {
                            'type': 'string'
                        }
                    }
                },
                {
                    'name': 'dropna',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'LITERAL',
                    'values': ['any', 'all'],
                },
                {
                    'name': 'dropna_exclude',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'ARRAY',
                    'dataTypeForArray': ['LITERAL'],
                    'jsonSchema': {
                        'minItems': 1,
                        '$schema': 'http://json-schema.org/draft-07/schema#',
                        'type': 'array',
                        'items': {
                            'type': 'string'
                        }
                    }
                },
                {
                    'name': 'asset_device_mappings',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'JSON',
                },
                {
                    'name': 'model_timestamp',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'JSON',
                },
                {
                    'name': 'local_model',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'BOOLEAN',
                },
                {
                    'name': 'incremental_summary',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'BOOLEAN',
                },
                # AS does not allow changing inputs, so below are obsolete but still kept
                {
                    'name': 'apm_id',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'LITERAL',
                },
                {
                    'name': 'apm_api_baseurl',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'LITERAL',
                },
                {
                    'name': 'apm_api_baseurl',
                    'type': 'CONSTANT',
                    'required': False,
                    'dataType': 'LITERAL',
                }
            ]
        }
        
        
        #Kewei Digital Twin begin
        """ if model_template_id not in [
                    'AnomalyDetectionAssetGroupPipeline',
                    'DegradationCurveAssetGroupPipeline',
                    'FailurePredictionAssetGroupPipeline',
                    'TimeToFailureAssetGroupPipeline',
                    'WmlScoringAssetGroupPipeline']:
            catalog_config['input'].extend(catalog_config_input_addition) """
        #end of Digital Twin

        #Kewei Digital Twin begin
        catalog_config_input_addition.append({
                        'name': 'target_pipeline_name',
                        'type': 'CONSTANT',
                        'required': True,
                        'dataType': 'LITERAL',
                    })
        kpi_config_input_addition['target_pipeline_name'] = self.__class__.__name__
        #end of Digital Twin
            
        catalog_config['input'].extend(catalog_config_input_addition)

        # Post model template to PMI first

        model_template_body={
            'modelTemplateId': model_template_id,
            'modelTemplateName': model_template_name,
            'modelTemplateDesc': model_template_desc,
            'modelJson': catalog_config,
        }

        self.logger.debug('Registering model template...')
        resp = api.register_model_template(model_template_body)
        self.logger.debug('Model registration response: %s', resp)
        if resp is None:
            raise RuntimeError('failed registering model template')

        if template_only == True:
            return None

        # next, register the trained model instance

        # register the asset group entity type first
        self._entity_type.register()
        resp = None
        is_91_instance = api.monitor_health_check()
        if (is_91_instance is True):
            resp = self._entity_type.register()
        else:
            resp = api.register_90_model(self._entity_type)

        # upload models

        uploaded_models = dict()
        if self.new_training:
            for estimator, model, model_paths in self.trained_models:
                estimator._save_model(bucket=estimator.get_bucket_name(), new_model=model, suffix=estimator.training_timestamp, local=False)
                estimator._save_model_df_trace(bucket=estimator.get_bucket_name(), suffix=estimator.training_timestamp, local=False, training=True)
                uploaded_models.setdefault(estimator.name, list()).extend(model_paths)
            self.logger.debug('Uploading trained models: uploaded_models=%s', uploaded_models)
        elif self.loaded_model_instance is not None:
            uploaded_models = self.loaded_model_instance['modelCosPath']
        else:
            #assert not 'this should never happen'
            self.logger.debug('This should never happen in uploading models. It is either new training or self.loaded_model_instance is not None')

        kpi_config = dict()

        
        if self.new_training:
            kpi_config = {
                'functionName': model_template_id, 
                'enabled': False,
                'input': self.config.copy(),
                'output': {
                    'names': self.pipeline_config['predictions']
                }
            }


            kpi_config['input'].update(kpi_config_input_addition)
            kpi_config['input']['model_timestamp'] = self.model_timestamp

            if len(self.pipeline_config['predictions']) > 0:
                kpi_config['outputMeta'] = {}
            for out in self.pipeline_config['predictions']:
                data_type = 'NUMBER' # default is NUMBER in AS when no dataType is given

                model_output_dtypes = self.new_training_output_df[self.pipeline_config['predictions']].dtypes.to_dict() if not self.new_training_output_df.empty and all([name in self.new_training_output_df.columns for name in self.pipeline_config['predictions']]) else dict()
                self.logger.debug('model_output_dtypes=%s', model_output_dtypes)

                if model_output_dtypes is not None and out in model_output_dtypes:
                    self.logger.debug(
                        'out=%s, model_output_dtypes[out]=%s, is_string_dtype=%s, is_bool_dtype=%s, is_datetime64_any_dtype=%s, is_numeric_dtype=%s', 
                        out, 
                        model_output_dtypes[out], 
                        pd.api.types.is_string_dtype(model_output_dtypes[out]), 
                        pd.api.types.is_bool_dtype(model_output_dtypes[out]), 
                        pd.api.types.is_datetime64_any_dtype(model_output_dtypes[out]), 
                        pd.api.types.is_numeric_dtype(model_output_dtypes[out])
                    )

                    if pd.api.types.is_bool_dtype(model_output_dtypes[out]):
                        data_type = 'BOOLEAN'
                    elif pd.api.types.is_numeric_dtype(model_output_dtypes[out]):
                        data_type = 'NUMBER'
                    elif pd.api.types.is_string_dtype(model_output_dtypes[out]):
                        data_type = 'LITERAL'
                    elif pd.api.types.is_datetime64_any_dtype(model_output_dtypes[out]):
                        data_type = 'TIMESTAMP'

                kpi_config['outputMeta'][out] = {'dataType': data_type}

            # dummy one to trigger AS to pass the backtrack adjusting to this pipeline
            kpi_config['backtrack'] = {'days':0, 'hours':0, 'minutes':0}
        elif self.loaded_model_instance is not None:
            kpi_config = self.loaded_model_instance['modelKpi']
        else:
            #assert not 'this should never happen'
            self.logger.debug('This should never happen. It is either a new training or loaded_model_instance is not None')

        # dummy one to trigger AS to pass the backtrack adjusting to this pipeline
        if self.post_processing is not None:
            for post in self.post_processing:
                post['backtrack'] = {'days':0, 'hours':0, 'minutes':0}

        self.logger.debug('kpi_config=%s', kpi_config)

        # Post model instance next

        if self.new_training:
            if model_instance_name is None:
                model_instance_name = '%s_%s_%s' % (self.asset_group_id, kpi_config['functionName'], datetime.utcnow().isoformat())
            if model_instance_desc is None:
                model_instance_desc = model_instance_name
        elif self.loaded_model_instance is not None:
            model_instance_name = self.loaded_model_instance['modelInstanceName']
            model_instance_desc = self.loaded_model_instance['modelInstanceDesc']
        else:
            #assert not 'this should never happen'
            self.logger.debug('This should never happen. It is either a new training or loaded_model_instance is not None')

        self.logger.debug('Final kpi_config before registering to Monitor: kpi_config=%s', kpi_config)

        model_instance_body = {
            'modelTemplateId': kpi_config['functionName'],
            'modelCosPath': uploaded_models, # format of uploaded_models is like {'DegradationCurveEstimator': ['apm/pmi/model/AssetGroupThree/DegradationCurveEstimator/degradation_curve_WEIBULL_1549957842']}
            'instanceName': model_instance_name,
            'instanceDesc': model_instance_desc,
            'kpiConfig': kpi_config, # kpi_config is the payload to be sent to AS for creating KPIs
            'granularity': self.granularity,
            'postProcessing': self.post_processing,
            'publishedOutputs': self.published_outputs,
        }
        #self.logger.debug('model_instance_body='+json.dumps(model_instance_body))
        resp = api.register_model_instance(self.asset_group_id, model_instance_body)
        self.logger.debug(resp)
        if resp is None:
            raise RuntimeError('failed registering model instance')

        # once registration complete successfully, set the flag to prevent unnecessary registration again
        self.new_training = False

        resp = json.loads(resp.text)
        self.logger.debug('Registed model instance with response: %s', resp)

        self.model_instance_id = resp['modelInstanceId']

        # once registration succeeds, write the initial prediction results directly

        # Kewei comment oout to test __write in the notebook
        self.logger.debug('Registration succeeded. Writing initial prediction results...')
        
        try:
            if write_initial_result and self.new_training_output_df is not None and not self.new_training_output_df.empty:       
                self.enable(enabled=True,schedule={"starting_at": "01:00:01", "every": "1D"})
                self._write(self.new_training_output_df)

        except:
            self.logger.debug('Error when writing initial prediction results...')
            return self.model_instance_id

        # safe-guard even though the model instance just registered is not yet enabled
        self._validate_checkpoint()

        self.logger.info('Registration was successful. New model ID = %s', self.model_instance_id)

        return self.model_instance_id

    def unregister(self, model_instance_id, force=False):
        """
        Unregister and cleanup the given model instance, including all its generated results.

        This method removes everything related to the given model instance. You get the model instance's 
        ID from the response of `pmlib.pipeline.AssetGroupPipeline.register` method.

        By default, only non-activated model instances can be unregistered. You can use parameter `force` 
        to forcefully unregister activated model instances.

        The model instance is deleted first; its model template is deleted afterwards, and only when 
        the instance deletion request returned a response. Failures are logged as warnings and the 
        method returns without raising.

        Parameters
        ----------
        model_instance_id : str
            The id of the model instance to be deleted.
        force : bool, optional
            If the given model instance is still activated, should it still be deleted forcefully? Default is False.

        Raises
        ------
        ValueError
            If `model_instance_id` is None.
        """

        if model_instance_id is None:
            raise ValueError('parameter model_instance_id must not be None')

        self.logger.info('Unregistering model instance...')
        self.logger.debug('Args: model_instance_id=%s, force=%s', model_instance_id, force)

        # first get back model instance definition to know its model template id
        resp = api.get_model_instance(self.asset_group_id, model_instance_id)
        if resp is None:
            # report the ID that was actually looked up, which may differ from the
            # instance currently loaded on this pipeline (self.model_instance_id)
            self.logger.warning('Cannot find model_instance_id=%s, please check if it is a valid one', model_instance_id)
            return

        model_instance = resp.json()
        model_template_id = model_instance['modelTemplateId']
        self.logger.debug('Found model template ID: %s', model_template_id)

        # now delete the model instance
        resp = api.unregister_model_instance(asset_group_id=self.asset_group_id, model_instance_id=model_instance_id, force=force)
        self.logger.debug('Sent API request to delete model instance. Response: %s', resp)
        if resp is None:
            self.logger.warning('Could not delete model instance. See response: %s', resp)
            return

        # delete model template at last, if successfully delete the model instance
        # let the API to take care if it can be deleted
        resp = api.unregister_model_template(model_template_id)
        self.logger.debug('Sent API request to delete model template. Response: %s', resp)
        if resp is None:
            self.logger.warning('Could not delete model template.')
        else:
            self.logger.info('Successfully unregistered model instance.')

    def enable(self, enabled=True, schedule=None, backtrack=None):
        """Enable/disable the trained model's scoring.

        After training the model and having it registered, this method can be used to enable it to start scoring 
        for new data coming in. If you load a previously trained model by creating the asset group pipeline object 
        by model instance ID, this method can also be used to enable/disable the scoring of a previously trained 
        model.

        When enabling, you also get to specify the schedule of when and how often the scoring should run, by 
        parameter `schedule`.

        Note that parameter `backtrack` currently does not have any effect (no-op).

        Parameters
        ----------
        enabled : `bool`, optional
            To enable or disable the model's scoring. Default is True (enable).
        schedule: `dict`, optional
            Specify when and how often the scoring should run. This dict can use two keys: `starting_at` and `every`, 
            either is optional. `starting_at` specifies the base time, or next starting time, for scheduling scoring. 
            It can be either a `pandas.Timestamp` object, anything acceptable by `pandas.Timestamp`, or simply a 
            string like '11:23' representing daily base starting time. The default is mid-night if it is not given 
            in the dict. `every` specifies the frequency of the scoring, which can be either `pandas.DateOffset`
            or anything (`str`, `tuple`, `datetime.timedelta`, `pandas.DateOffset`) parsable by 
            `pandas.tseries.frequencies.to_offset`, typically given like `1day` (runs every day) or `15min` 
            (runs every 15 minutes), or `10hour`(runs every 10 hours), or `2week`(runs every 2 weeks). The default 
            is no recurring scheduling, i.e. one time scoring, if not given in the dict. The default for this dict 
            is None, which means no specific scheduling and the scoring is launched every 5 minutes (with best 
            effort).
        backtrack: `dict`, optional
            Not used currently.

        Returns
        -------
        The response returned by the enable/disable API call.

        Raises
        ------
        RuntimeError
            If this pipeline has no model instance ID yet, or if the API call returned no response.
        """

        if self.model_instance_id is None:
            raise RuntimeError('only trained model instance can be enabled, train it first or load a previously trained one by its model instance ID')

        # first add missing granularities
        monitor_api.add_missing_granularities(self.asset_group_id, CUSTOM_GRANULARITY + self.granularity)

        self.logger.info('Enabling model scoring...')
        # schedule/backtrack may legitimately hold pandas.Timestamp / DateOffset / timedelta objects,
        # which json cannot serialize natively; default=str keeps this debug line from raising TypeError
        self.logger.debug(
            'Args: enabled=%s, schedule=%s, backtrack=%s',
            enabled,
            json.dumps(schedule, default=str),
            json.dumps(backtrack, default=str),
        )
        resp = api.enable_model_instance(asset_group_id=self.asset_group_id, model_instance_id=self.model_instance_id, enabled=enabled, schedule=schedule, backtrack=backtrack)
        self.logger.debug('Attempted enabling model instance with response: %s', resp)
        if resp is None:
            raise RuntimeError('failed %s model instance' % ('enabling' if enabled else 'disabling'))

        # a scheduled model needs a type-level checkpoint record to make progress (see _validate_checkpoint)
        if enabled and schedule is not None:
            self._validate_checkpoint()

        self.logger.info('Finished enabling model scoring.')

        return resp

    def _validate_checkpoint(self):
        """Make sure a type-level checkpoint record exists for this asset group's entity type.

        Workaround for an issue in AS: when all KPIs of a type have a schedule, the type never 
        progresses because of the lack of a type-level checkpoint record. This forcefully inserts 
        one (with empty `entity_id` and `key`) if none exists yet.

        Nothing is done in local mode. The whole operation is best effort: any database failure is 
        logged and rolled back, it is never raised to the caller.
        """
        if api.is_local_mode():
            return

        metadata = self.db.entity_type_metadata[self.asset_group_id]
        if isinstance(metadata, dict):
            entity_type_id = metadata.get('entityTypeId', None)
        else:
            entity_type_id = self._entity_type._entity_type_id
        if entity_type_id is None:
            self.logger.warning('cannot find entity_type_id of entity_type=%s', self.asset_group_id)
            return

        self.db.start_session()
        try:
            checkpoint_table = self.db.get_table(self._entity_type.checkpoint_table, self.db_schema)
            # count the type-level records, i.e. those with an empty entity_id
            existing = self.db.connection.execute(select([func.count()]).select_from(checkpoint_table).where(and_(checkpoint_table.c['entity_type_id'] == entity_type_id, checkpoint_table.c['entity_id'] == ''))).first()[0]
            if existing == 0:
                self.logger.debug('entity_type=%s with entity_type_id=%s does not have any type-level checkpoint yet', self.asset_group_id, entity_type_id)

                self.db.connection.execute("insert into %s%s (entity_type_id, entity_id, key, timestamp) values (%s, '', '', '%s')" % (('%s.' % self.db_schema) if self.db_schema is not None else '', checkpoint_table.name, entity_type_id, pd.Timestamp.utcnow()._short_repr))

                self.logger.debug('entity_type=%s with entity_type_id=%s type-level checkpoint inserted', self.asset_group_id, entity_type_id)
            else:
                self.logger.debug('entity_type=%s with entity_type_id=%s already has type-level checkpoint', self.asset_group_id, entity_type_id)
        except Exception:
            # deliberately best effort: keep the caller going, but leave a trace of what went wrong
            self.logger.warning('failed validating type-level checkpoint of entity_type=%s', self.asset_group_id, exc_info=True)
            self.db.session.rollback()
        finally:
            self.db.commit()

    def _write(self, df):
        '''
        Write the model outputs, and locally computed post-processing aggregations, to the data lake.

        This method is intended for notebook usage to write the model outputs to the data lake.

        Normally, the writing of model outputs is AS responsibility and only by running the model 
        pipeline on the AS server side can users get to write the outputs to the data lake. This 
        method is meant for the testing purpose, making it more convenient for data scientists.

        This method not only writes the model outputs, but also generates post processing defined
        for the pipeline. Normally this is not possible due to the fact the code processing 
        aggregation is on AS server side. Here, as a temporary measure, simple code is added to 
        handle the aggregation logic, but only supporting simple aggregation. Any unsupported 
        aggregation would be ignored and warned when running this method.

        Note that this is a temporary measure before AS moves all its server side logic to the 
        client library 'iotfunctions', at which time we shall replace this method with whatever 
        is available in 'iotfunctions'.

        # TODO once AS moves all persistence logic to iotfunctions, replace this method

        Parameters
        ----------
        df : `pandas.DataFrame`
            The model outputs to be written. The aggregation code below resets the index and then 
            uses the entity id column and an 'evt_timestamp' column, so both are presumably index 
            levels of this dataframe (TODO confirm against callers).

        Returns
        -------
        dict
            Maps each granularity name to the dataframe handed to the writer for it. Key None holds 
            the given `df` itself (written to the 'dm_<asset group id>' table).

        Raises
        ------
        RuntimeError
            If, outside local mode, the entity type properties returned by the server are not valid 
            JSON, or if the dataframe of a dependent granularity is None.
        '''
        # defaults, used as-is in local mode
        meta = {
            'frequencies': DEFAULT_FREQUENCY,
            'granularities': DEFAULT_GRANULARITY + self.granularity,
        }

        self.logger.info('Writing model to data lake...')
        self.logger.debug('Meta before performing write operations: meta=%s', meta)

        # update frequencies and granularities with contents from DB
        if not api.is_local_mode():

            # TODO: Health Check API Here
            # the health check decides which identifier is used to fetch the entity type properties:
            # the entity type id when it returns True, otherwise the asset group uuid
            is_91_instance = api.monitor_health_check()
            entity_type_id = api.get_entity_type_id_by_entity_type_name(self._entity_type.logical_name)
            asset_group_uuid = api.get_uuid_by_entity_type_name(self.db,self._entity_type.logical_name)
            self.logger.debug('Retrieved asset_group_uuid=%s', asset_group_uuid)
            if (is_91_instance is True):
                meta_remote = self._entity_type.db.http_request(object_type='input', object_name=str(entity_type_id), request='GET')
            else:
                meta_remote = self._entity_type.db.http_request(object_type='input', object_name=str(asset_group_uuid), request='GET')
            try:
                meta_remote = json.loads(meta_remote)
            except (TypeError, json.JSONDecodeError):
                raise RuntimeError('API call to server did not retrieve valid entity type properties. No metadata received.')

            # remote frequencies fully replace the local defaults
            meta['frequencies'] = meta_remote['frequencies']

            # granularities are merged by name; update() mutates the dict objects that are also
            # referenced from meta['granularities'], so matching local entries pick up remote content
            local_grains = {granularity['name']:granularity for granularity in meta['granularities']}
            remote_grains = {granularity['name']:granularity for granularity in meta_remote['granularities']}
            for grain, granularity in remote_grains.items():
                if grain in local_grains:
                    # update local content with remote one
                    local_grains[grain].update(granularity)
                else:
                    # missing grain needs to be created on AS
                    # NOTE(review): this only adds the grain to local_grains, which is not read again
                    # below; meta['granularities'] does not receive it - confirm whether intended
                    local_grains[grain] = granularity
        self.logger.debug('Meta before building granularities: meta=%s', meta)
        granularities = self._entity_type.build_granularities(grain_meta=meta.get('granularities'), freq_lookup=meta.get('frequencies'))
        self.logger.debug('Built granularities=%s', granularities)

        # also write any post-processing aggregation metrics

        # only support simple aggregation in local mode, never intend to be fully implemented
        # maps the KPI function name to the pandas aggregation method name
        supported_aggs = {
            'Count': 'count',
            'First': 'first',
            'Last': 'last',
            'Maximum': 'max',
            'Mean': 'mean',
            'Median': 'median',
            'Minimum': 'min',
            'Product': 'product',
            'StandardDeviation': 'std',
            'Sum': 'sum',
            'Variance': 'var',
        }

        # dataframe per granularity name; None stands for the raw (non-aggregated) model outputs
        grain_dfs = {None: df}

        # granularity name each data item lives at; raw predictions live at None
        data_item_grain_lookup = {v:None for v in self.pipeline_config['predictions']}
        if self.post_processing is not None:
            # KPIs ordered by dependency level so inputs are available before they are aggregated
            processing_queue = self._get_kpi_dependency_tree_processing_queue(pipeline=self.post_processing, raw_metrics_set=set(self.pipeline_config['predictions']))

            # group the supported aggregation KPIs by their target granularity, keeping queue order
            grain_grouped_aggregators = OrderedDict()
            for post_processing in processing_queue:
                kpi = post_processing.kpi
                func_name = kpi['functionName']
                if (not kpi['enabled'] or
                        func_name not in supported_aggs or
                        'granularity' not in kpi or
                        kpi['granularity'] is None):
                    self.logger.warning('unsupported post_processing=%s in local mode', post_processing)
                    continue

                if kpi['granularity'] not in grain_grouped_aggregators:
                    grain_grouped_aggregators[kpi['granularity']] = list()
                grain_grouped_aggregators[kpi['granularity']].append(kpi)
            self.logger.debug('grain_grouped_aggregators=%s', grain_grouped_aggregators)

            for grain, kpis in grain_grouped_aggregators.items():
                # granularity of the dataframe this granularity is aggregated from
                dep_grain = None

                # (input, pandas method, output) per KPI, used later to rename the aggregated columns
                aggregation_meta = []
                # input column -> list of pandas aggregation method names
                agg_dict = defaultdict(list)
                for kpi in kpis:
                    func_input = kpi['input']['source'] # only simple aggregators are supported
                    func_output = kpi['output']['name'] # only simple aggregators are supported
                    func_name = supported_aggs[kpi['functionName']]
                    agg_dict[func_input].append(func_name)

                    aggregation_meta.append((func_input, func_name, func_output))

                    data_item_grain_lookup[func_output] = grain

                    if dep_grain is None:
                        # just use the first input's grain as dependent grain
                        dep_grain = data_item_grain_lookup[func_input]
                self.logger.debug('aggregation_column_methods=%s', agg_dict)

                df_grain = grain_dfs[dep_grain]
                if df_grain is None:
                    raise RuntimeError('something wrong with post-processing configuration: dependency_grain=%s' % dep_grain)

                # NOTE(review): original_index is not used anywhere below
                original_index = df_grain.index.names
                df_grain = df_grain.reset_index()
                self.logger.debug('df_grain after index reset = %s', log_df_info(df_grain, head = 5, comment = f'Grain: {grain}', logger=self.logger, log_level=logging.DEBUG))
                
                #For MAS8.9 ,self._entity_type._df_index_entity_id=id, self._entity_type._entity_id=deviceid, the following line make the df_grain.rename_axis fails
                
                #df_grain = df_grain.rename(columns={self._entity_type._df_index_entity_id: self._entity_type._entity_id})
                self.logger.debug('df_grain after index reset and modifying column name = %s', log_df_info(df_grain, head = 5, comment = f'Grain: {grain}', logger=self.logger, log_level=logging.DEBUG))
                # self.logger.debug('About to group the df_grain by granularities[grain].grouper = %s', granularities[grain].grouper)
                #groups = df_grain.groupby(granularities[grain].grouper)
                #df_grain = groups.agg(agg_dict)


                # accumulates the aggregated rows of all entities
                df_grain_grouped = pd.DataFrame()

                id_col_name= self._entity_type._df_index_entity_id
                self.logger.debug(' id_col_name=%s', id_col_name)


                # aggregate each entity separately: resample its rows on 'evt_timestamp' at the
                # granularity's frequency, then drop every row that contains a missing value
                for id_val, grouped_df in df_grain.groupby(id_col_name):
    
                    grouped_df.set_index('evt_timestamp',inplace=True)
                    df1 = grouped_df.resample(granularities[grain].freq).agg(agg_dict)
                    df1.reset_index(inplace=True,drop=False)
    
                    df1[id_col_name] = id_val
                    df1 = df1.dropna()
                    df_grain_grouped = pd.concat([df_grain_grouped, df1], ignore_index=True)



                df_grain_grouped.set_index([id_col_name,'evt_timestamp'],inplace=True)

                # NOTE(review): at this point df_grain is still the un-aggregated, index-reset frame;
                # the same message is logged again right after the reassignment below
                self.logger.debug('after groups.agg(agg_dict) before df_grain.rename_axis =%s', log_df_info(df_grain, head=5, logger=self.logger, log_level=logging.DEBUG))

                df_grain= df_grain_grouped


                self.logger.debug('after groups.agg(agg_dict) before df_grain.rename_axis =%s', log_df_info(df_grain, head=5, logger=self.logger, log_level=logging.DEBUG))
                #df_grain = df_grain.rename_axis([self._entity_type._df_index_entity_id if label == self._entity_type._entity_id else label for label in df_grain.index.names])
                self.logger.debug('aggregation_after_groupby_df_grain=%s', log_df_info(df_grain, head=5, logger=self.logger, log_level=logging.DEBUG))

                # flattened 'input|method' column label -> KPI output name (or the input name when
                # the KPI has no output name)
                renamed_cols = {}
                for src, func_name, name in aggregation_meta:
                    renamed_cols['%s|%s' % (src, func_name)] = name if name is not None else src
                self.logger.debug('renamed_cols=%s', renamed_cols)

                # flatten the column labels into 'a|b' strings so they match the keys of renamed_cols;
                # a label whose last element is empty is joined without that element
                # (presumably the labels are (input, method) tuples produced by agg - TODO confirm)
                new_columns = []
                for col in df_grain.columns:
                    if len(col[-1]) == 0:
                        new_columns.append('|'.join(col[:-1]))
                    else:
                        new_columns.append('|'.join(col))
                df_grain.columns = new_columns

                if len(renamed_cols) > 0:
                    df_grain = df_grain.rename(columns=renamed_cols)

                # make the result available to granularities that depend on this one
                grain_dfs[grain] = df_grain

                self.logger.debug('aggregation_df_grain=%s', log_df_info(df_grain, head=5, logger=self.logger, log_level=logging.DEBUG))

        # persist every dataframe: the raw outputs go to 'dm_<asset group id>', each aggregated one
        # to the table of its granularity
        for grain, df_grain in grain_dfs.items(): 
            self.logger.debug('Iterating over grain DataFrames dictionary. Current iteration: %s', grain)
            if grain is None:
                grain_tuple = None
                target_table_name = 'dm_%s' % self.asset_group_id
            else:
                grain_tuple = (granularities[grain].freq, granularities[grain].dimensions, granularities[grain].entity_id is not None)
                
                target_table_name = granularities[grain].table_name
            self.logger.debug('While iterating, found target_table_name=%s', target_table_name)

            target_table_name = target_table_name.lower()

            writer = PersistColumns(
                target_grain=grain,
                target_grain_tuple=grain_tuple,
                target_table=target_table_name,
                db=self._entity_type.db,
                db_schema=self._entity_type._db_schema,**self.kwargs)
            writer.execute(df_grain)

        return grain_dfs
    
    def _get_latest_prediction_timestamp(self, predictions: List[str]=None) -> pd.Timestamp:
        """Returns the latest timestamp of prediction data for this asset group pipeline.

        The prediction table is queried for the maximum timestamp per key. The timestamp of the 
        first name in `predictions` that is present in the table is returned.

        Args:
            predictions (List[str]): predictions to use in predictions table; if None, will use `self.pipeline_config['predictions']` in its place

        Returns:
            pd.Timestamp: latest prediction timestamp, or None if the prediction table cannot be 
            determined, the table is empty, no predictions are given nor configured, or none of the 
            predictions has data in the table.
        """
        prediction_table_name = self._get_prediction_table_name()
        self.logger.debug(
            'Retrieved prediction table name: %s', prediction_table_name)
        if prediction_table_name is None:
            return None

        db = api._get_db()
        with db.engine.connect() as conn:
            df = pd.read_sql(
                f"SELECT KEY, MAX(TIMESTAMP) FROM {self.db_schema}.{prediction_table_name} GROUP BY KEY;", conn)
        if df.empty:
            return None

        # only fall back to the configured predictions when the caller gave none; an explicitly
        # given list must be honored instead of short-circuiting to None
        if predictions is None:
            if 'predictions' not in self.pipeline_config:
                return None
            predictions = self.pipeline_config['predictions']

        # address the two result columns by position: the label of the unaliased aggregate column
        # (and the letter case of the key column) depends on the database driver
        prediction_timestamps = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))

        self.logger.debug('Predictions to check table for: %s', predictions)
        self.logger.debug(
            'All predictions in predictions table: %s', prediction_timestamps.keys())

        for prediction in predictions:
            if prediction in prediction_timestamps:
                return pd.Timestamp(prediction_timestamps[prediction])
        return None

    def _get_prediction_table_name(self) -> str:
        """Look up the table that stores the predictions of this model.

        The data items of the entity type named after this asset group are searched for the first 
        one whose name is among the predictions of the model pipeline.

        Returns:
            str: source table name of that data item, or None when the model pipeline has no 
            predictions or no matching data item is found
        """
        # data items of the entity type matching this asset group; empty if there is no such type
        data_items = next(
            (
                type_meta['dataItemDto']
                for type_meta in self.entity_type_metadata.values()
                if type_meta['name'] == self.asset_group_id
            ),
            [],
        )

        if 'predictions' not in self.model_pipeline:
            return None

        # first data item that is one of the predictions decides the table
        prediction_names = self.model_pipeline['predictions']
        return next(
            (item['sourceTableName'] for item in data_items if item['name'] in prediction_names),
            None,
        )

    def _get_kpi_dependency_tree_processing_queue(self, pipeline, raw_metrics_set):
        """Order the KPIs of the given pipeline by their dependency level.

        The dependency tree is parsed first. Level 1 items are queued in tree order, followed by 
        the deeper levels in ascending order (2, 3, 4, ...). Level 0 (raw) items and sidecar items 
        (additional outputs of a function already queued) are never queued.

        Returns the processing queue, a list of tree nodes.
        """
        kpi_tree, sidecar_items, processing_queue = self._parse_kpi_dependency_tree(pipeline=pipeline, raw_metrics_set=raw_metrics_set)

        # level 1 goes straight into the queue, anything deeper is parked per level
        deferred = defaultdict(list)
        for node in kpi_tree.values():
            depth = node.tree_level()
            if depth == 1:
                if node.name not in sidecar_items:
                    processing_queue.append(node)
            elif depth > 1:
                deferred[depth].append(node)

        # then the parked ones, shallowest level first
        for depth in sorted(deferred):
            processing_queue.extend(node for node in deferred[depth] if node.name not in sidecar_items)

        return processing_queue

    def _parse_kpi_dependency_tree(self, pipeline, raw_metrics_set, derived_metrics_set=None):
        """Build the dependency tree of the data items produced by the given KPI pipeline.

        One tree node is created per raw source and per KPI output. A source of a KPI counts as 
        raw when it is in `raw_metrics_set`, otherwise it is taken as derived, i.e. it must be an 
        output of some KPI in the pipeline.

        Parameters
        ----------
        pipeline : iterable of `dict`
            The KPI definitions to parse.
        raw_metrics_set : `set`
            Names of the raw (level 0) data items.
        derived_metrics_set : `set`, optional
            Only logged currently. Default is an empty set.

        Returns
        -------
        tuple
            `(kpi_tree, sidecar_items, processing_queue)`: the `dict` of tree nodes by data item 
            name, the `set` of output names not to be queued, and a new empty `list` for callers 
            to fill.

        Raises
        ------
        KeyError
            If a KPI source is neither a raw metric nor an output of any KPI in the pipeline.
        """
        # TODO once AS moves all persistence logic to iotfunctions, replace this method

        # a fresh set per call instead of a shared mutable default
        if derived_metrics_set is None:
            derived_metrics_set = set()

        processing_queue = list()

        kpi_tree = dict()

        # one function can have multiple outputs and it only needs to be invoked once to 
        # generate all outputs. since we are creating one treenode per item, we can not 
        # put all output items in the returned queue but just one of them, which 
        # would avoid invoking the same instance multiple times. here we use a set to 
        # store those names not to be put in the final queue
        sidecar_items = set()

        self.logger.debug('raw_metrics_set=%s, derived_metrics_set=%s', raw_metrics_set, derived_metrics_set)

        for kpi in pipeline:
            targets = self._get_kpi_targets(kpi)
            sources = self._get_kpi_sources(kpi)

            # granularity dimensions are not taken as dependencies, only the KPI sources are
            raw_sources = sources & raw_metrics_set
            derived_sources = sources - raw_sources

            self.logger.debug('kpi_raw_sources=%s, kpi_derived_sources=%s, kpi=%s', raw_sources, derived_sources, kpi)

            # 1st pass can only create nodes for raw sources
            source_nodes = []
            for raw_name in raw_sources:
                raw_node = _KpiTreeNode(name=raw_name, kpi=None, dependency=None, level=0)
                kpi_tree[raw_name] = raw_node
                source_nodes.append(raw_node)

            # for derived ones, simply insert their names first
            source_nodes.extend(derived_sources)

            # for each target, create a treenode with all source nodes as dependency
            # (all targets of one KPI share the same dependency list)
            for idx, target in enumerate(targets):
                kpi_tree[target] = _KpiTreeNode(name=target, kpi=kpi, dependency=source_nodes)
                if idx > 0:
                    sidecar_items.add(target)

        # 2nd pass to replace any string source reference with _KpiTreeNode source (could only be derived metrics)
        for node in kpi_tree.values():
            if node.dependency is None:
                continue
            for idx, dep in enumerate(node.dependency):
                if isinstance(dep, _KpiTreeNode):
                    continue
                if dep not in kpi_tree:
                    raise KeyError('data item %s required by %s is neither a raw metric nor an output of any KPI in the pipeline' % (dep, node.name))
                node.dependency[idx] = kpi_tree[dep]
                kpi_tree[dep].children.add(node)

        return (kpi_tree, sidecar_items, processing_queue)

    def _get_kpi_targets(self, kpi):
        """Collect the names of all data items the given KPI outputs.

        Every value under the KPI's 'output' is looked at: a string is taken as a comma separated 
        list of names (blank entries dropped, names stripped), a list made of strings only is 
        taken as is, anything else is ignored.

        Returns a `list` of names, in the order found.
        """
        # TODO once AS moves all persistence logic to iotfunctions, replace this method

        outputs = kpi.get('output', {})
        names = []

        for key in outputs:
            value = outputs[key]
            if isinstance(value, str):
                stripped = (piece.strip() for piece in value.split(','))
                names += [piece for piece in stripped if piece]
            elif isinstance(value, list) and all(isinstance(item, str) for item in value):
                names += value

        return names

    def _get_kpi_sources(self, kpi):
        """Collect the names of the data items the given KPI reads.

        Only the 'source' entry under the KPI's 'input' is considered: a string is taken as a 
        comma separated list of names (blank entries dropped, names stripped), a list made of 
        strings only is taken as is, anything else is ignored.

        Returns a `set` of names.
        """
        # TODO once AS moves all persistence logic to iotfunctions, replace this method

        inputs = kpi.get('input', {})
        names = set()

        for key in inputs:
            if key != 'source':
                continue

            value = inputs[key]
            if isinstance(value, str):
                stripped = (piece.strip() for piece in value.split(','))
                names.update(piece for piece in stripped if piece)
            elif isinstance(value, list) and all(isinstance(item, str) for item in value):
                names.update(value)

        return names


class _KpiTreeNode(object):
    """A node of the KPI dependency tree.

    A node is identified by its `name`: hashing and equality are both based on the
    name only, so two nodes with the same name are interchangeable in sets and dicts.

    Parameters
    ----------
    name : `str`
        The name of the data item this node stands for.
    kpi : object
        The KPI function definition producing the data item.
    dependency : `list`, `_KpiTreeNode`, `str`, optional
        What this node depends on. A single (non-list) value is wrapped into a
        one-element list. Elements are either `_KpiTreeNode` objects or plain names
        that have not been resolved into nodes yet.
    level : `int`, optional
        An explicit tree level. When given, it is returned by `tree_level()` as is.
    """
    # TODO once AS moves all persistence logic to iotfunctions, replace this method

    def __init__(self, name, kpi, dependency=None, level=None):
        self.name = name
        self.kpi = kpi
        if dependency is None or isinstance(dependency, list):
            self.dependency = dependency
        else:
            self.dependency = [dependency]
        self._level = level
        # nodes depending on this one, filled in by whoever builds the tree
        self.children = set()

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, other):
        # Let Python fall back to its default handling when compared against a
        # non-node object, instead of failing with AttributeError on `other.name`.
        if not isinstance(other, _KpiTreeNode):
            return NotImplemented
        return self.name == other.name

    def tree_level(self):
        """Return the level of this node in the dependency tree.

        An explicitly given level always wins. A node without any dependency list
        (`dependency is None`) also returns its explicit level, which may be `None`.
        Otherwise the level is one more than the highest level among the
        dependencies, unresolved (non-node) dependencies counting as level 0. A node
        with an empty dependency list is at level 1.
        """
        if self._level is not None or self.dependency is None:
            return self._level
        elif self.dependency:
            return max(d.tree_level() if isinstance(d, _KpiTreeNode) else 0 for d in self.dependency) + 1
        else:
            return 1

    def __repr__(self):
        if self.dependency is None:
            return '%s' % (self.name)

        # Dependencies may still be plain names before they get resolved into nodes;
        # show those at level 0, the same level tree_level() assumes for them.
        described = [
            '(%s) %s' % ((dep.tree_level(), dep.name) if isinstance(dep, _KpiTreeNode) else (0, dep))
            for dep in self.dependency
        ]
        return '(%s) %s <- %s' % (self.tree_level(), self.name, str(described))


class _ModelPipelineConfig(dict):
    '''The model pipeline configuration, as a `dict`.

    This mainly takes care of proper handling of the given features, features_for_training, predictions, and features_resampled. Features, features_for_training, and features_resampled together determine what the input data items should be.

    All the kwargs are made keys in this `dict` at the end, which means, any new configuration keys are automatically made available in the resultant model pipeline config `dict`. Two extra special keys, inputs and renamed_inputs, are added as the results of the processing of the features and features_for_training for loading data. Once set, these two keys cannot be reassigned through item assignment.

    Parameters
    ----------
    db : object
        The database object, passed on to the `api` helpers used for validating features_resampled.
    entity_type_metadata : `dict`
        Metadata keyed by entity type name. When not in local mode, its `dataItemDto` entries are used to look up the column type of resampled features.
    training : `bool`, optional
        Whether the pipeline is in training mode. When True, features_for_training are loaded together with features. Default is True.
    features : `list` of `str`, optional
        The array of feature data items given in the format of <type>:<name>:<new_name> where <new_name> is optional and default to the original name. <type> can be other entity type, while <name> must be a valid data item name in that entity type. When referring to asset's attributes, use empty string for <type>, ex ':name:new_name'.
    features_for_training : `list` of `str`, optional
        Features used for training only. while in training, both features and features_for_training are loaded; while in prediction, only features are loaded. its usage is same as features.
    predictions : `list` of `str`, optional
        The array of predicted data item names. Optional, if not given, the model is not used for prediction.
    features_resampled : `dict`, optional
        Resampling configuration keyed by feature entity type name. Each value is a `dict` which must contain the key '${freqency}' (the time grain) and may map feature names (after renaming) to a `dict` of aggregation method to output name.

    Raises
    ------
    ValueError
        If no input is given at all when training, if features and features_for_training have duplicate names (after renaming), or if features_resampled is invalid.
    '''

    SEPARATOR = ':'

    def __init__(self, db, entity_type_metadata, training=True, features=None, features_for_training=None, predictions=None, features_resampled=None, **kwargs):
        self.logger = get_logger(self)

        features = self.parse_data_item_name(features) if features is not None else []
        self['features'] = remove_list_duplicate([feature[2] for feature in features])
        inputs = [_ModelPipelineConfig.SEPARATOR.join([feature[0], feature[1]]) for feature in features]
        renamed_inputs = [feature[2] for feature in features]

        # we don't use 'targets' any more
        # if features_for_training is None and 'targets' in kwargs:
        #     features_for_training = kwargs['targets']

        features_for_training = self.parse_data_item_name(features_for_training) if features_for_training is not None else []
        self['features_for_training'] = remove_list_duplicate([feature[2] for feature in features_for_training])

        # self['targets'] are used by iotfunctions BaseEstimatorFunction only for generating model file name
        self['targets'] = [feature[1] for feature in features_for_training] # TODO not really used since all related methods are overriden

        # features_for_training need to be loaded together with features when training, when in training mode
        if training:
            inputs.extend([(_ModelPipelineConfig.SEPARATOR.join([feature[0], feature[1]])) for feature in features_for_training])
            renamed_inputs.extend([feature[2] for feature in features_for_training])

        if len(inputs) == 0 and training:
            # it is possible to have empty inputs when scoring, like the degradation model
            raise ValueError('features and features_for_training cannot both be empty for training')

        # validate there is no duplicate feature/feature_for_training names (after renaming)
        all_feature_names = remove_list_duplicate(self['features'] + self['features_for_training'])
        if len(features + features_for_training) != len(all_feature_names):
            # walk the renamed inputs in lockstep with the de-duplicated feature names,
            # anything which falls out of step is reported as duplicated
            duplicated_names = set()
            j = 0
            for i, name in enumerate(renamed_inputs):
                if j >= len(self['features']) or name != self['features'][j]:
                    duplicated_names.add(name)
                else:
                    j += 1

            duplicated_features = []
            for name in duplicated_names:
                duplicated_features.append([input for i, input in enumerate(inputs) if renamed_inputs[i] == name])
            raise ValueError('features and features_for_training cannot have duplicate names: %s' % duplicated_features)

        # self['predictions'] are used by BaseEstimatorFunction for final df column names
        self['predictions'] = [] if predictions is None else predictions.copy()

        # features_resampled parsing, transformed into a dict of (entity_type_name, (time_grain, agg_methods, agg_outputs))
        self['features_resampled'] = self._parse_features_resampled(all_features=features + features_for_training, features_resampled=features_resampled, db=db, entity_type_metadata=entity_type_metadata)
        for feature_type, rest in self['features_resampled'].items():
            t, m, outputs = rest
            for feature, outs in outputs.items():
                # a resampled feature is replaced, in place, by its aggregation output names
                if feature in self['features']:
                    idx = self['features'].index(feature)
                    self['features'][idx:idx+1] = outs
                elif feature in self['features_for_training']:
                    idx = self['features_for_training'].index(feature)
                    self['features_for_training'][idx:idx+1] = outs

        self['inputs'] = tuple(inputs)
        self['renamed_inputs'] = tuple(renamed_inputs)

        self.logger.debug('Initializing ModelPipelineConfig Dict with the following parameters: features=%s, features_for_training=%s, predictions=%s, features_resampled=%s, inputs=%s, renamed_inputs=%s', 
                          self['features'], 
                          self['features_for_training'], 
                          self['predictions'], 
                          self['features_resampled'], 
                          self['inputs'], 
                          self['renamed_inputs'])

        # shallow copy list/dict kwargs values so the top-level containers given by the caller are not shared
        for key, value in kwargs.items():
            if isinstance(value, (list, dict)):
                kwargs[key] = value.copy()

        # then, add kwargs to self
        self.update(kwargs)

        self.logger.debug('Added kwargs to ModelPipelineConfig: %s', kwargs)

    def __setitem__(self, key, value):
        '''Set a configuration key, refusing to reassign 'inputs' and 'renamed_inputs' once they are set.

        Raises
        ------
        ValueError
            If key is 'inputs' or 'renamed_inputs' and it is already present.
        '''
        if key in ['inputs', 'renamed_inputs'] and key in self:
            raise ValueError('key=%s is immutable' % key)
        super().__setitem__(key, value)

    def parse_data_item_name(self, names):
        '''This method parses the given array of names of format <entity_type>:<original_name>:<new_name>.

        It returns an array of 3-element array, 1st being the entity type name, 2nd the original data item 
        name within that entity type, and 3rd being the new name to be used (renaming).

        The 1st part entity type can be empty string, which means the data item is from the asset, not 
        sensors.

        The last (3rd) part of each name can be ignored which would by default use the original name.
        In this case, it does not matter whether the last colon (between 2nd and 3rd element) is given.

        Note that it is allowed to give a string without colon, in which case it represents an asset's 
        data item which would not be renamed.
        '''
        parsed_names = []
        for name in names:
            if name.find(_ModelPipelineConfig.SEPARATOR) == -1:
                # no separator at all, it is an asset data item (empty entity type)
                name = '%s%s' % (_ModelPipelineConfig.SEPARATOR, name)
            name = [n.strip() for n in name.split(_ModelPipelineConfig.SEPARATOR)]
            if len(name) < 3:
                name.append(name[1])
            elif len(name[2]) == 0:
                name[2] = name[1]
            parsed_names.append(name)
        return parsed_names

    def _parse_features_resampled(self, all_features, features_resampled, db, entity_type_metadata):
        '''Validate and transform the given features_resampled configuration.

        Parameters
        ----------
        all_features : `list`
            The parsed features and features_for_training, as returned by `parse_data_item_name`.
        features_resampled : `dict`, optional
            The resampling configuration keyed by feature entity type name.
        db : object
            The database object passed on to the `api` helpers.
        entity_type_metadata : `dict`
            Metadata keyed by entity type name, used when not in local mode to look up column types.

        Returns
        -------
        `dict`
            Keyed by entity type name, each value being whatever `api._validate_resampling` returns 
            for the (time_grain, agg_methods, agg_outputs) built here.

        Raises
        ------
        ValueError
            If an entity type or a feature name is unknown, a resample config is not a `dict`, or 
            the '${freqency}' key is missing.
        '''
        if features_resampled is None:
            features_resampled = dict()

        all_feature_types = {f[0] for f in all_features if f[0] != ''} # get all device entity types used
        all_features = [f for f in all_features if f[0] != ''] # keep device features only (asset attributes are not resampled here)

        invalid_resampled_types = {type_name for type_name in features_resampled if type_name not in all_feature_types}
        if len(invalid_resampled_types) > 0:
            raise ValueError('unknown feature_type=%s in model_pipeline.features_resampled' % invalid_resampled_types)

        transformed_features_resampled = dict()
        for type_name, resamples in features_resampled.items():
            if not isinstance(resamples, dict):
                raise ValueError('invalid resample_config=%s of feature_type=%s in model_pipeline.features_resampled' % (str(resamples), type_name))

            # NOTE the misspelled '${freqency}' is the actual configuration key, do not "fix" it
            if '${freqency}' not in resamples:
                raise ValueError('missing ${freqency}, invalid resample_config=%s of feature_type=%s in model_pipeline.features_resampled' % (str(resamples), type_name))

            time_grain = resamples['${freqency}']

            # get all feature names (after renamed) of this type
            features_of_type = {f[2] for f in all_features if f[0] == type_name}

            agg_methods, agg_outputs = dict(), dict()
            for name, method_n_output in resamples.items():
                if name == '${freqency}':
                    continue

                if name not in features_of_type:
                    raise ValueError('unused or invalid feature_name=%s, invalid resample_config=%s of feature_type=%s in model_pipeline.features_resampled' % (name, str(resamples), type_name))
                else:
                    features_of_type.remove(name)

                for method, output in method_n_output.items():
                    agg_methods.setdefault(name, []).append(method) 
                    # default output name: the data item name itself when there is only one method, otherwise suffixed by the method
                    output = output if output is not None else ('${data_item}_${method}' if len(method_n_output) > 1 else '${data_item}')
                    # variable substitution
                    output = re.sub(r'(\${(data_item)})', name, output, flags=re.I)
                    output = re.sub(r'(\${(method)})', method, output, flags=re.I)
                    agg_outputs.setdefault(name, []).append(output)

            # for remaining features of this type not explicitly specified: 'mean' for NUMBER columns, 'max' otherwise
            for name in features_of_type:
                if api.is_local_mode():
                    entity_type = api.get_entity_type(entity_type_name=type_name, db=db)
                    if entity_type.db.get_as_datatype(entity_type.table.c[name]) == 'NUMBER':
                        method = 'mean'
                    else:
                        method = 'max'
                else:
                    # falls back to 'mean' when the data item is not found in the metadata
                    method = 'mean'
                    for dto in entity_type_metadata[type_name]['dataItemDto']:
                        if dto['name'] == name:
                            method = 'mean' if dto['columnType'] == 'NUMBER' else 'max'
                agg_methods.setdefault(name, []).append(method) 
                agg_outputs.setdefault(name, []).append(name)

            time_grain, agg_methods, agg_outputs = api._validate_resampling(entity_type_name=type_name, db=db, time_grain=time_grain, agg_methods=agg_methods, agg_outputs=agg_outputs)

            transformed_features_resampled[type_name] = (time_grain, agg_methods, agg_outputs)

        return transformed_features_resampled


class WmlScoringAssetGroupPipeline(AssetGroupPipeline):
    """An asset group pipeline for using custom models deployed on Watson Machine Learning (WML) for scoring.

    You can freely train a model and deploy it to WML to get a deployment ID back. With a WML deployment ID, 
    you can use WML as a scoring service and let Maximo APM Predict to load the data to send to WML for 
    scoring, then get prediction results back. This is easily supported by this class.

    Note that, by training and deploying your custom model to WML, it is your responsibility to ensure any 
    data preprocessing, feature extraction necessary for using your model for scoring is all in place when using 
    this class. In most cases, it means you must deploy your complete model data pipeline to WML as part of 
    the model's deployment. You must also ensure the data configured for this pipeline to load meets your 
    model's WML deployment request schema.

    Here's a typical example of how to create an object of this class:

    ```
    WmlScoringAssetGroupPipeline(
        asset_group_id='ID of an asset group',
        model_pipeline={
            'features': ['DeviceTypeOne:temperature', 'DeviceTypeTwo:Humidity'],
            'predictions': ['predicted_output'],
            'wml_deployment_uid': 'c4312345-7890-aazz-bbyy-5xx51155ee00',
            'result_value_index': [0]
        })
    ```

    This model has two special `model_pipeline` configuration for tuning further:

    * `wml_deployment_uid`: `str`

        The deployment ID from WML.

    * `result_value_index`: `list` of `int`

        How to index into the returned response array of array from WML deployment. The typical 
        WML deployment response is an array of array, one element (an array) in the outer array 
        per input data sent in the request to WML. For example, by sending an array of input 
        data [[1, 2, 3], [4, 5, 6]], we get response [[0.1, 1], [0.7, 1]]. In this case, if we 
        specify `result_value_index` as [0], it means using the first element of each sub-array 
        in the response as the scoring result, which gets us the result [0.1, 0.7] 
        (0.1 for [1, 2, 3], and 0.7 for [4, 5, 6]).
    """

    def __init__(self, asset_group_id, model_pipeline, **kwargs):
        kwargs['model_timestamp'] = {} # not used, but always needed by register()
        if self.__class__ == WmlScoringAssetGroupPipeline:
            # subclasses are left to provide their own model template name
            kwargs['model_template_name'] = 'Custom Model on Watson Machine Learning'
        super().__init__(asset_group_id=asset_group_id, model_pipeline=model_pipeline, **kwargs)

        self.logger = get_logger(self)

        self.training = False
        self.new_training = True

    def prepare_model_config(self, model_config):
        """This class overrides this method to validate the WML specific model pipeline configuration and 
        to set the default value to the following two custom model pipeline configuration when not given 
        in the constructor's parameter `model_pipeline`.

        * `cache_model`: by default, if not given, set to False
        * `wml_credentials`: by default, if not given, parsed as JSON from the environment variable `WML_VCAPS`

        When `wml_client_version` is 'V4' and the environment variable `USER_ACCESS_TOKEN` is set, its value 
        is put into `wml_credentials` as `token`.

        Raises
        ------
        ValueError
            If `wml_deployment_uid` is not given, or if no WML credentials can be determined.
        RuntimeError
            If the environment variable `WML_VCAPS` cannot be parsed as JSON.

        See `pmlib.pipeline.AssetGroupPipeline.prepare_model_config`.
        """

        if 'wml_deployment_uid' not in model_config:
            raise ValueError('model_pipeline.wml_deployment_uid must be given')
        self.logger.info('wml_deployment_uid=%s', model_config['wml_deployment_uid'])

        if 'wml_credentials' not in model_config or model_config['wml_credentials'] is None:
            wml_credentials = None
            if 'WML_VCAPS' in os.environ:
                try:
                    wml_credentials = json.loads(os.environ['WML_VCAPS'])
                except Exception as e:
                    raise RuntimeError('error getting environment WML_VCAPS: %s' % e) from e

            model_config['wml_credentials'] = wml_credentials

        # must be checked before the token is injected below, otherwise missing credentials
        # would surface as a TypeError on None instead of this explicit error
        if model_config['wml_credentials'] is None:
            raise ValueError('missing WML credentials, please specify it in model_pipeline.wml_credentials')

        # cp4d case, set token to be USER_ACCESS_TOKEN
        if model_config.get('wml_client_version', None) == 'V4' and 'USER_ACCESS_TOKEN' in os.environ:
            model_config['wml_credentials']['token'] = os.environ['USER_ACCESS_TOKEN']

        if 'cache_model' not in model_config:
            model_config['cache_model'] = False

    def prepare_execute(self, pipeline, model_config):
        """Add a `WmlDeploymentEstimator`, created from the given `model_config`, as a stage of the given pipeline."""
        pipeline.add_stage(WmlDeploymentEstimator(**model_config))

class WmlSPSSScoringAssetGroupPipeline(WmlScoringAssetGroupPipeline):
    """An asset group pipeline for scoring with SPSS based custom models deployed on Watson Machine Learning (WML).

    It is used the same way as `WmlScoringAssetGroupPipeline`: train your model, deploy it to WML to 
    get a deployment ID, then let Maximo APM Predict load the data, send it to WML for scoring, and 
    get the prediction results back.

    As the model is trained and deployed by you, any data preprocessing or feature extraction the 
    model needs for scoring must already be part of what is deployed to WML, and the data configured 
    for this pipeline to load must meet the request schema of the model's WML deployment.

    A typical way of creating an object of this class:

    ```
    WmlSPSSScoringAssetGroupPipeline(
        asset_group_id='ID of an asset group',
        model_pipeline={
            'features': ['DeviceTypeOne:temperature', 'DeviceTypeTwo:Humidity'],
            'predictions': ['predicted_output'],
            'wml_deployment_uid': 'c4312345-7890-aazz-bbyy-5xx51155ee00',
            'result_value_index': [0]
        })
    ```

    Special `model_pipeline` configuration of this model:

    * `wml_deployment_uid`: `str`

        The deployment ID from WML.

    * `result_value_index`: `list` of `int`

        How to index into the response from the WML deployment, which typically is an array of 
        array with one inner array per input data sent in the request. For example, sending input 
        data [[1, 2, 3], [4, 5, 6]] may get the response [[0.1, 1], [0.7, 1]]. With 
        `result_value_index` being [0], the first element of each inner array is used as the 
        scoring result, that is, [0.1, 0.7] (0.1 for [1, 2, 3], and 0.7 for [4, 5, 6]).

    * `deployment_mode` : `online` or `batch`
    """

    def prepare_model_config(self, model_config):
        """Prepare the model pipeline configuration exactly as the parent class does.

        See `WmlScoringAssetGroupPipeline.prepare_model_config`.
        """
        super().prepare_model_config(model_config)

    def prepare_execute(self, pipeline, model_config):
        """Add a `WmlSPSSDeploymentEstimator`, created and initialized from the given `model_config`, as a 
        stage of the given pipeline.

        The logger of this object is replaced by the root logger, whose level is set to the 
        `log_level` of `model_config` (default to 40, that is, `logging.ERROR`).
        """
        # NOTE(review): this reconfigures the root logger, which affects logging process wide - confirm it is intended
        root_logger = logging.getLogger()
        self.logger = root_logger
        root_logger.setLevel(model_config.get('log_level', logging.ERROR))
        self.logger.debug(str(model_config))
        self.logger.debug("WmlSPSSScoringAssetGroupPipeline::prepare_execute() - adding the estimator")

        # the estimator takes the configuration twice: at construction, then through init_config()
        spss_estimator = WmlSPSSDeploymentEstimator(**model_config)
        spss_estimator.init_config(**model_config)
        pipeline.add_stage(spss_estimator)

# TODO should this inherit from base pipeline class or AS base class?
class SimpleCustomAssetGroupPipelineLoader:
    """A simple custom asset group pipeline proxy and loader.

    Creating custom pipeline requires putting the custom pipeline's code somewhere to register 
    to Maximo APM Predict so the code can be fetched by the runtime to generate prediction. 
    Usually, the custom code is put either as a github repository (could be private) or as 
    a downloadable source archive somewhere, installable from `pip`.

    This class provides an alternative, easier way for creating custom pipeline.

    It works like this. You write and train custom pipeline class as usual. But instead of packaging 
    it as a library and put it somewhere, you `register()` without giving the `url` parameter. 
    Internally, when registering, the system knows this class is to be used as the loader and proxy 
    and it serializes your custom pipeline class and save it to object storage. The registration uses
    the name of your custom pipeline class, though with the target pointing to this class.

    Later when scoring runs, this class (the target) knows how to deserialize/load the custom pipeline 
    class back from object storage and then acts as a proxy to forward the scoring request to the
    custom pipeline class.

    Below is a typical skeleton example of how this class is used:

    ```
    class MyAnomalyDetectionEstimator(AnomalyDetectionEstimator):
        pass # for simplicity, do nothing here

    class MyAnomalyDetectionAssetGroupPipeline(pmlib.AnomalyDetectionAssetGroupPipeline):
        def prepare_execute(self, pipeline, model_config):
            pipeline.add_stage(MyAnomalyDetectionEstimator(**model_config))

    group = MyAnomalyDetectionAssetGroupPipeline(
                asset_group_id='ID of an asset group',
                model_pipeline={
                    'features': ['DeviceTypeOne:temperature', 'DeviceTypeTwo:Humidity'],
                    'features_for_training': [':faildate'],
                    'predictions': ['anomaly_score', 'anomaly_threshold'],
                    'pre_failure_window_size': 20,
                    'pre_failure_failure_size': 10,
                    'srom_training_options': {
                        'exectype': 'spark_node_random_search'
                    }
                })
    df = group.execute()

    group.register()
    ```

    Note that it is exactly the same as the normal usage except we don't provide the `url` parameter 
    to the last `register()` call.
    """

    is_data_source = True
    """Asset data loader is a data source in AS.

    Data source functions are executed at the beginning stage of AS pipelines.
    """

    merge_method = 'replace'
    """Asset data loader is primary data source which always replaces upstream dataframe with its own.

    Using replace merge method means our asset data loader is the one responsible for what should be the 
    dataframe passed to downstream. Internally, our asset data loader does all the necessary merging and 
    joining, between the input upstream dataframe and loaded asset data. This merge method makes sure 
    AS does not intervene with this logic.
    """

    def __init__(self, target_pipeline_name, asset_group_id, model_pipeline, **kwargs):
        """
        Parameters
        ----------
        target_pipeline_name : str
            The name of the target pipeline class, usually it should be `target_pipeline_class.__name__`.
        asset_group_id : str
            The ID of the asset group, used for locating the saved target pipeline class and passed 
            on to it.
        model_pipeline : dict
            The model pipeline configuration, passed on to the target pipeline class as is.
        **kwargs
            Passed on to the target pipeline class as is. If `local_model` is given and true, the 
            target pipeline class is loaded from the local file system.

        Raises
        ------
        RuntimeError
            If the target pipeline class cannot be loaded.
        """

        super().__init__()

        self.logger = get_logger(self)
        self.model_template_name = 'Simple Custom Asset Group Pipeline Loader'

        # making sure environment variables necessary for PMI are all set
        api.init_environ()

        self.logger.debug('in SimpleCustomAssetGroupPipelineLoader init(), asset_group_id=%s', asset_group_id)

        db = api._get_db(asset_group_id=asset_group_id)

        cos_path = '/'.join(['apm', 'pmi', 'model_pipeline', asset_group_id, target_pipeline_name, 'pipeline'])
        self.logger.debug('in SimpleCustomAssetGroupPipelineLoader init(), cos_path=%s', cos_path)

        # stays None when no storage yields the class, which is reported below as a load failure
        target_pipeline_class = None
        if kwargs.get('local_model') or api.is_local_mode():
            # local FS
            try:
                with open(cos_path, mode='rb') as file:
                    target_pipeline_object = file.read()
            except FileNotFoundError as e:
                raise RuntimeError('target_pipeline_name=%s cannot be loaded' % target_pipeline_name) from e
            # NOTE deserializing with dill can execute arbitrary code, the file must come from a trusted location
            target_pipeline_class = dill.loads(target_pipeline_object)
        else:
            cos_kpi = os.environ.get('COS_BUCKET_KPI')
            self.logger.debug('In load_model cos_kpi=%s', cos_kpi)

            if cos_kpi is not None:
                self.logger.debug('In load_model , load from COS')
                target_pipeline_class = db.cos_load(filename=cos_path, bucket=None, binary=True)
            elif db.model_store is not None:
                self.logger.debug('In load_model , before load from KPI_MODEL_STORE')
                if db.model_store.entity_type_id is None:
                    db.model_store.entity_type_id = api.get_entity_type_id_by_entity_type_name(asset_group_id)
                self.logger.debug('before db.model_store.retrieve_model  db.model_store.entity_type_id=%s', db.model_store.entity_type_id)
                target_pipeline_class = db.model_store.retrieve_model(cos_path, deserialize=True)
                self.logger.debug('In load_model , after load from KPI_MODEL_STORE')
            else:
                self.logger.debug('In load_model, db.model_store is None')

        if target_pipeline_class is None:
            # fail the same way as the local branch, instead of an UnboundLocalError/TypeError below
            raise RuntimeError('target_pipeline_name=%s cannot be loaded' % target_pipeline_name)

        self.target_pipeline = target_pipeline_class(asset_group_id=asset_group_id, model_pipeline=model_pipeline, **kwargs)

    def execute(self, df=None, start_ts=None, end_ts=None, entities=None):
        """Forward the call, with all the parameters as given, to the loaded target pipeline and return its result."""
        return self.target_pipeline.execute(df=df, start_ts=start_ts, end_ts=end_ts, entities=entities)

    def register(self, url=None, model_template_name=None, model_template_desc=None, model_instance_name=None, model_instance_desc=None, **kwargs):
        """Not supported, this class only loads and proxies an already registered custom pipeline.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError('this class cannot be used to register directly')
