# Licensed Materials - Property of IBM
# 5737-M66, 5900-AAA
# (C) Copyright IBM Corp. 2019, 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication, or disclosure
# restricted by GSA ADP Schedule Contract with IBM Corp.

"""Asset Life Curve.

Statistically, to evaluate the mean life of assets, the sample mean or the average age method is acceptable if a big population has end-of-life information. But assets such as generators, transformers, reactors, cables, etc. have a relatively long life, up to and even beyond 40 years, and generally there is very limited end-of-life failure data. This algorithm is designed to address this use case (to estimate mean life with limited end-of-life failure data). In fact the proposed algorithm works best when fewer than 20% of the assets have end-of-life failure data.

This model predicts the Asset Life Curve for a group of assets. Among the challenges of asset health assessment, the asset failure probability and its expected remaining life are key aspects for analyzing the asset health status.

The Failure Probability Curve model uses a statistical distribution to assess the failure probability vs. year, and this model supports 
using either the normal distribution or the Weibull distribution.
"""

import datetime
import json
import math
import sys




import logging

from scipy.stats import norm


import numpy as np
import pandas as pd
from scipy.stats import norm

from .estimator import BaseEstimator
from .transformer import _BaseTransformer
from .pipeline import AssetGroupPipeline
from .util import get_logger, log_df_info


class DegradationCurveAssetGroupPipeline(AssetGroupPipeline):
    """End of Life Curve model pipeline.

    This `pmlib.pipeline.AssetGroupPipeline` sub-class implements the Asset Life Curve model.

    This model uses asset installation date and decommissioned date as training features. By default, 
    there's no need to specify installation and decommissioned date specifically. If you do specify 
    `features_for_training` in the dict parameter `model_pipeline`, it is ignored. The default 
    `features_for_training` uses `[':installdate', ':statusdate', ':status']`.

    Here's a typical example of how to create an object of this class:

    ```
    DegradationCurveAssetGroupPipeline(
        asset_group_id='ID of an asset group',
        decommissioned_status='UITCOMMISSIE',
        model_pipeline={
            'statistics_distribution_args': {
                'distribution_type': 'WEIBULL',
                'mean_or_scale': None,
                'stddev_or_shape': None
            }
        })
    ```

    The status value that marks an asset as decommissioned can be given in any language through 
    `decommissioned_status`, for example `decommissioned_status='UITCOMMISSIE'` (Dutch). When it is 
    missing, `None` or empty, `'DECOMMISSIONED'` is used.

    This model has a special `model_pipeline` configuration `statistics_distribution_args` which provides 
    the following four keys for tuning the model:

    * `distribution_type`: {'NORMAL', 'WEIBULL'}

        The type of distribution to use.

    * `mean_or_scale`: `float`, optional

        Specify mean value when using 'NORMAL' or scale value when using 'WEIBULL' distribution type. Default 
        is None, which means to calculate from the input data.

    * `stddev_or_shape`: `float`, optional

        Specify standard deviation value when using 'NORMAL' or shape value when using 'WEIBULL' distribution 
        type. Default is None, which means to calculate from the input data.

    * `snapshot_year`: `int`, optional

        The reference year intended for calculating the service time of assets that are not yet removed 
        (snapshot year minus installation year). Default is None.

    See base class for details on all the parameters available.
    """
    def __init__(self, **kwargs):
        if self.__class__ == DegradationCurveAssetGroupPipeline:
            kwargs['model_template_name'] = 'End of Life Curve'
            kwargs['model_template_desc'] = 'End of Life Curve'
        kwargs['fillna'] = None
        kwargs['dropna'] = None

        # Status value identifying decommissioned assets. A missing, None or empty value falls back
        # to the default instead of silently producing a filter that can never match.
        self.decommissioned_status_local = kwargs.get('decommissioned_status') or 'DECOMMISSIONED'

        # special for degradation curve, since both features and features_for_training cannot be none at the same time
        if kwargs.get('model_pipeline', None) is not None:
            # The decommissioned date is derived from two fields (status and statusdate), so custom
            # training features cannot work here: always force the default ones, even if the user
            # accidentally specified something else.
            kwargs['model_pipeline']['features_for_training'] = [':installdate', ':statusdate', ':status']

        super().__init__(**kwargs)

    def prepare_model_config(self, model_config):
        """This class validates whether the `statistics_distribution_args` is given and also within it 
        whether `distribution_type` is given as either `NORMAL` or `WEIBULL` (case insensitive).

        The validated `distribution_type` is written back in upper case and the same `model_config` 
        object is returned. A `ValueError` is raised when validation fails.

        See `pmlib.pipeline.AssetGroupPipeline.prepare_model_config`.
        """
        distribution_args = model_config.get('statistics_distribution_args', None)
        if distribution_args is None:
            raise ValueError('model_pipeline.statistics_distribution_args must be given')

        distribution_type = distribution_args.get('distribution_type', None)
        if distribution_type is None:
            raise ValueError('model_pipeline.statistics_distribution_args.distribution_type must be given')

        # str() so that a non-string value is reported as an invalid type instead of an AttributeError
        distribution_type = str(distribution_type).upper()
        if distribution_type not in ('NORMAL', 'WEIBULL'):
            raise ValueError('model_pipeline.statistics_distribution_args.distribution_type must be either NORMAL or WEIBULL')
        distribution_args['distribution_type'] = distribution_type

        return model_config

    def prepare_execute(self, pipeline, model_config):
        """This class overrides this method to filter the input on the decommissioned status and then 
        use `pmlib.degradation_curve.DegradationCurveEstimator`.

        See `pmlib.pipeline.AssetGroupPipeline.prepare_execute`.
        """
        pipeline.add_stage(DegradationCurveInputDataFilter('status', self.decommissioned_status_local))
        pipeline.add_stage(DegradationCurveEstimator(**model_config))

class DegradationCurveInputDataFilter(_BaseTransformer):
    """Transformer that keeps only the rows of decommissioned assets.

    A row is kept when its `status_field` value equals `decommissioned_status`, compared 
    case-insensitively. All other rows (including rows with a missing status) are dropped.
    """

    def __init__(self, status_field='status', decommissioned_status='DECOMMISSIONED'):
        super().__init__()
        self.status_field = status_field
        self.decommissioned_status = decommissioned_status

    def execute(self, df, start_ts=None, end_ts=None, entities=None):
        """Return the rows of `df` whose status matches the decommissioned status."""
        self.logger.debug('df_input=%s', log_df_info(df, head=5, logger=self.logger, log_level=logging.DEBUG))

        # The data side is upper-cased, so the configured status must be upper-cased as well;
        # otherwise a mixed- or lower-case configuration value can never match.
        target_status = self.decommissioned_status
        if isinstance(target_status, str):
            target_status = target_status.upper()

        df = df[df[self.status_field].str.upper() == target_status]

        self.logger.debug('df_final=%s', log_df_info(df, head=10, logger=self.logger, log_level=logging.DEBUG))

        return df

class DegradationCurveEstimator(BaseEstimator):
    """ Asset Life Curve estimator.

    This sub-class estimator mainly implements `train_model()` to calculate the degradation curve.
    """

    def __init__(self, features, targets, predictions, statistics_distribution_args=None, **kwargs):
        super().__init__(features, targets, predictions, **kwargs)
        self.statistics_distribution_args = statistics_distribution_args
        # features_for_training is expected to start with the installation date column followed
        # by the decommission (status) date column
        training_features = kwargs['features_for_training']
        self.installdate = training_features[0]
        self.decommissiondate = training_features[1]

    def train_model(self, df):
        """Calculate the degradation curve from `df`.

        Returns a dict with the single key `final_degradation_curve` holding the curve produced by 
        `DegradationCurve.execute`. Raises `ValueError` when no distribution type is configured.
        """
        distribution_args = self.statistics_distribution_args
        # distribution should not be none (this also covers statistics_distribution_args itself being None)
        if distribution_args is None or distribution_args.get("distribution_type") is None:
            raise ValueError('distribution_type must be NORMAL or WEIBULL')

        degradation_curve = DegradationCurve(
            data_items=None,
            statistics_distribution_args=distribution_args,
            degradation_curve=None,
            installdate=self.installdate,
            decommissiondate=self.decommissiondate)

        return {
            'final_degradation_curve': degradation_curve.execute(df)
        }

    def get_model_extra(self, new_model, model_path):
        """Return the extra artifact to store with the model: its JSON dump under `<model_path>_json`."""
        model_json_path = model_path + '_json'
        extras = [(model_json_path, json.dumps(new_model), False, False)] # no pickle dump, not binary

        self.logger.debug('extras=%s', extras)

        return extras

    
    
    
import sys
import logging
import math
from scipy.stats import norm
import pandas as pd
import numpy as np
import datetime

# from .util import log_df_info

class DegradationCurve:
    '''
    The degradation curve supports NORMAL and WEIBULL distributions
    distribution_type: the distribution type, NORMAL or WEIBULL
    mean_or_scale: the mean value of NORMAL or the scale value of WEIBULL
    stddev_or_shape: the standard deviation value of NORMAL or the shape value of WEIBULL
    '''

    def __init__(self, data_items, statistics_distribution_args, degradation_curve, installdate, decommissiondate):
#         self.logger = logging.getLogger('analytics_service.%s.%s' % (self.__module__, self.__class__.__name__))
        self.data_items = data_items
        self.statistics_distribution_args = statistics_distribution_args
        self.degradation_curve = degradation_curve
        self.installdate = installdate
        self.decommissiondate = decommissiondate
        self.snapshot_year = self.statistics_distribution_args.get('snapshot_year',2021)

    def execute (self, df):
        print("df_input: %s" % df.head(5))
        print('df.shape='+str(df.shape))
        df_original = df

        # pick only needed columns for scoring 
#         df = df[list(set(self.data_items) - set(df.index.names))]
        print("df_input_filter: %s" % df.head(5))

        sources_not_in_column=df.index.names
        df = df.reset_index()

        # parse statistics_distribution_args
        distribution_type = self.statistics_distribution_args["distribution_type"]
        mean_or_scale = self.statistics_distribution_args["mean_or_scale"]
        stddev_or_shape = self.statistics_distribution_args["stddev_or_shape"]

        # distribution should not be none
        if distribution_type is None:
            raise('distribution_type should be NORMAL or WEIBULL')

        # using sample data to test degradation curve calculation
#         df_curve_training_data = self.sample_data()
        df_curve_training_data = df
        print('df_curve_training_data.shape='+str(df_curve_training_data.shape))
        print('df_curve_training_data=\n%s' % df_curve_training_data.head(10))

        # fit the degradation curve
#         df_score = df.astype({self._entity_type._timestamp: 'datetime64[ms]'})[[self._entity_type._df_index_entity_id, self._entity_type._timestamp]].copy()
        final_degradation_curve = self.fit(df_curve_training_data, distribution_type, mean_or_scale, stddev_or_shape)
#         df_score[self.degradation_curve] = str(final_degradation_curve).strip('[]')
        print('dfinal_degradation_curve=\n%s' % df.head())

#         df = df_original.merge(df_score, how='left', left_index=True, right_on=[self._entity_type._df_index_entity_id, self._entity_type._timestamp], sort=False)
#         df = df.set_index(keys=sources_not_in_column)
#         print('df_final=\n%s' % df.head())

        return final_degradation_curve


    def fit(self, df_curve_training_data, distribution_type, mean_or_scale, stddev_or_shape):
        print('initial mean_or_scale=%s' % mean_or_scale)
        print('initial stddev_or_shape=%s' % stddev_or_shape)

#         df_curve_training_data[self.installdate] = pd.to_datetime(df_curve_training_data[self.installdate]).dt.year
#         df_curve_training_data[self.decommissiondate] = np.where(pd.notna(df_curve_training_data[self.decommissiondate]), pd.to_datetime(df_curve_training_data[self.decommissiondate]).dt.year, -1)
#         df_curve_training_data = df_curve_training_data.astype({self.installdate: int, self.decommissiondate: int})
        print('df_curve_training_data=%s' % df_curve_training_data.head(100))

        # initialize parameter
        mean_or_scale_final = 0.0
        stddev_or_shape_final = 0.0

        # if mean value and stddev value are specified by user, use them to generate the degradation curve
        if mean_or_scale is not None and stddev_or_shape is not None:
            print('generate degradation curve with user defined parameters')
            mean_or_scale_final = float(mean_or_scale)
            stddev_or_shape_final = float(stddev_or_shape)
        else:
            # generate NORMAL distribution by input data
            if distribution_type == 'NORMAL':
                print('calculate the mean value and stddev value for normal distribution...')
                mean_or_scale_final, stddev_or_shape_final, df_cfp = self.generate_normal_distribution(df_curve_training_data)
                print('calculate done')

            # generate WEIBULL distribution by input data
            if distribution_type == 'WEIBULL':   
                print('calculate the scale value and shape value for normal distribution...')
                mean_or_scale_NORMAL, stddev_or_shape_NORMAL, df_cfp = self.generate_normal_distribution(df_curve_training_data)
                mean_or_scale_final, stddev_or_shape_final = self.generate_weibull_distribution(mean_or_scale_NORMAL, stddev_or_shape_NORMAL, df_cfp)
                print('calculate done')

        # return the final curve
        return self.generate_final_curve(distribution_type, mean_or_scale_final, stddev_or_shape_final)
                
    def generate_normal_distribution(self, df_curve_training_data):
        # df_curve_training_data: [assetId, installationDate, removeDate]
        # step1. pre-processing
        if 'date_service' not in df_curve_training_data.columns :
            #date_service = df_curve_training_data[self.decommissiondate] - df_curve_training_data[self.installdate]
            df_curve_training_data[self.installdate] = pd.to_datetime(df_curve_training_data[self.installdate])
            df_curve_training_data[self.decommissiondate] = pd.to_datetime(df_curve_training_data[self.decommissiondate])
            # date_service = np.ceil( (df_curve_training_data[self.decommissiondate].dt.tz_localize(None) - df_curve_training_data[self.installdate].dt.tz_localize(None))/np.timedelta64(1, 'Y') )
            # Changing np.timedelta64(1, 'Y') to np.timedelta64(365, 'D') as it's no longer supported after numpy 1.26.4
            date_service = np.ceil( (df_curve_training_data[self.decommissiondate].dt.tz_localize(None) - df_curve_training_data[self.installdate].dt.tz_localize(None))/np.timedelta64(365, 'D') )
            print('date_service='+str(date_service))
            df_curve_training_data['retired_flag'] = np.where(date_service>=0, 1, 0)
        
            print('snapshot_year='+str(self.snapshot_year))
            # df_curve_training_data['date_service'] =np.ceil( (df_curve_training_data[self.decommissiondate].dt.tz_localize(None) - df_curve_training_data[self.installdate].dt.tz_localize(None))/np.timedelta64(1, 'Y') )
            # Changing np.timedelta64(1, 'Y') to np.timedelta64(365, 'D') as it's no longer supported after numpy 1.26.4
            df_curve_training_data['date_service'] =np.ceil( (df_curve_training_data[self.decommissiondate].dt.tz_localize(None) - df_curve_training_data[self.installdate].dt.tz_localize(None))/np.timedelta64(365, 'D') )
            #df_curve_training_data['date_service'] = np.where(date_service>=0, date_service, self.snapshot_year - df_curve_training_data[self.installdate])  # should be datetime.datetime.now().year, 2000 as test year for sample data
            print('df_curve_training_data_pre_processing=\n%s' % df_curve_training_data)

        print (df_curve_training_data.shape)
        # step2. calculate the exposed table
        df_exposed_table = df_curve_training_data.groupby(['date_service']).agg({'retired_flag':'sum', 'date_service':'count'})
        df_exposed_table.rename(columns={'date_service': 'pre_exposed_number'}, inplace=True)
        df_exposed_table.sort_index(inplace=True, ascending=False)
        
        df_exposed_table['exposed_number'] = df_exposed_table.pre_exposed_number.cumsum()
        del df_exposed_table['pre_exposed_number']
        df_exposed_table = df_exposed_table.reset_index()
        df_exposed_table.rename(columns={'retired_flag':'retired_number', 'date_service':'age'}, inplace=True)
        print('df_exposed_table=\n%s' % df_exposed_table.head(20))

        # step3. calculate cumulative failure probablity (cfp) table
        # df_exposed_table: [age, retired_number, exposed_number]
        max_age = df_exposed_table['age'].max()
        print('max_age = %s' % str(max_age))
        
        
        max_age_idx = df_exposed_table[df_exposed_table['age'] == max_age].index.values.astype(int)[0] # 0, first row
        print('------------ ' + str(max_age_idx))
        a1 = (df_exposed_table.loc[max_age_idx, 'retired_number']) 
        a2 = (df_exposed_table.loc[max_age_idx, 'exposed_number'])
        if (a1 == a2) : 
            print ("Number of exposed assets = number of retired assets in max_age, meaning all the asset observed failed at max_age")
            # If max_age-1 exist in the df, drop the max_age row as outlier. Some assets may survive after max_age year; we just need enough time to observe.
            # If max_age-1 doesn't exist in the df, modify max_age row to max_age-1
            if ( (max_age-1) == df_exposed_table.loc[max_age_idx+1, 'age'] ):
                #print ("max_age-1 is in !!! drop max_age row")
                df_exposed_table = df_exposed_table.drop(df_exposed_table.index[max_age_idx]) 
            else:    
                df_exposed_table.loc[max_age_idx, 'retired_number'] = 0
                df_exposed_table.loc[max_age_idx, 'age'] = max_age - 1
        #print('df_exposed_table 2 =\n%s' % df_exposed_table.head(200))        
        #print('------------')
        
        
        df_cfp = df_exposed_table[df_exposed_table['retired_number'] !=0]
        print('df_cfp_input=\n%s' % df_cfp.head())
        df_cfp['pre_cumulative_probability'] = df_cfp['retired_number'] / df_cfp['exposed_number']
        
        if df_cfp['age'].min() > 1 : # if there is no aging failure at first year, add a row
            cfp_first_line = [df_cfp['age'].min()-1, -1, -1, 0.001]
            df_cfp = pd.concat([df_cfp, pd.DataFrame(np.array([cfp_first_line]), columns=['age', 'retired_number', 'exposed_number', 'pre_cumulative_probability'])], ignore_index=True)
        print('df_cfp_add_1st dataframe=\n%s' % df_cfp.head(200))
        print (df_cfp[df_cfp['age']==max_age]['retired_number'].head())
        print ('Tony2')
        
        if df_cfp[df_cfp['age']==max_age].empty : # if there is no end-of-life failure in last year
            cfp_last_line = [max_age, -1, -1, 0]
            df_cfp = pd.concat([pd.DataFrame(np.array([cfp_last_line]), columns=['age', 'retired_number', 'exposed_number', 'pre_cumulative_probability']), df_cfp], ignore_index=True)
        print('df_cfp dataframe=\n%s' % df_cfp.head(200))
        df_cfp.sort_values('age', inplace=True)
        df_cfp['cumulative_probability'] = df_cfp.pre_cumulative_probability.cumsum()
        print('df_cfp_cumsum=\n%s' % df_cfp.head())
        print('df_cfp_cumsum=\n%s' % df_cfp.tail())
        
        # If any df_cfp['cumulative_probability'] >=1, The assumption in the paper is violated. 
        
        # Option 1, Fall back to regular mean and std
        if (df_cfp['cumulative_probability'] >= 1).any() : 
            print('Too many decomissioned assets. Switch to regular mean and std.')
            df_decommissioned = df_curve_training_data[df_curve_training_data['retired_flag'] == 1]
            mean_or_scale_NORMAL = df_decommissioned['date_service'].mean()
            stddev_or_shape_NORMAL = df_decommissioned['date_service'].std()
            print('mean_or_scale_NORMAL = %s' % mean_or_scale_NORMAL)
            print('stddev_or_shape_NORMAL = %s' % stddev_or_shape_NORMAL)
            
            df_cfp['cumulative_probability'] = norm.cdf(df_cfp['age'], loc=mean_or_scale_NORMAL, scale=stddev_or_shape_NORMAL) 
            print('df_cfp_tony=\n%s' % df_cfp.head())
            print('df_cfp_tony=\n%s' % df_cfp.tail())
            return mean_or_scale_NORMAL, stddev_or_shape_NORMAL, df_cfp
        '''
        # Option 2, set cumulative_probability 999999% and move on
        df_cfp.loc[df_cfp['cumulative_probability'] >=1, 'cumulative_probability'] = 0.999999
        '''
        
        df_cfp['z'] = norm.ppf(df_cfp['cumulative_probability'])
        print('after sorted df_cfp=\n%s' % df_cfp.head(20))
        print('after sorted df_cfp=\n%s' % df_cfp.tail(20))
        
        
        

        # step4. estimate optimal mean value and stddev value
        #df_cfp: [age, retired_number, exposed_number, pre_cumulative_probability, cumulative_probability, z]
        age_mean = df_cfp['age'].mean()
        z_mean = df_cfp['z'].mean()
        df_cfp['pre_Szx'] = (df_cfp['z'] - df_cfp['z'].mean()) * (df_cfp['age'] - df_cfp['age'].mean())
        df_cfp['pre_Szz'] = (df_cfp['z'] - df_cfp['z'].mean()) * (df_cfp['z'] - df_cfp['z'].mean())
        Szx = df_cfp['pre_Szx'].sum()
        Szz = df_cfp['pre_Szz'].sum()
        stddev_or_shape_NORMAL = Szx / Szz
        mean_or_scale_NORMAL = age_mean - stddev_or_shape_NORMAL * z_mean
        print('mean_or_scale_NORMAL=\n%s' % mean_or_scale_NORMAL)
        print('stddev_or_shape_NORMAL=\n%s' % stddev_or_shape_NORMAL)
        print('before return df_cfp=%s ' % df_cfp.head(200))
        return mean_or_scale_NORMAL, stddev_or_shape_NORMAL, df_cfp

    def obj_func (self, alpha, beta, age_array, SP_array):
        err_objective_iter = 0
        rowcount = len(age_array)
        for i in range(0,rowcount): 
            try:
                err = math.pow(math.log(SP_array[i]) + math.pow((age_array[i] / alpha), beta),2)
            except OverflowError:
                return float('inf')
            err_objective_iter = err_objective_iter + err
        return err_objective_iter    

    def generate_weibull_distribution(self, mean_or_scale_NORMAL, stddev_or_shape_NORMAL, df_cfp):
        # step1. get the survival table based on cfp table
        print('WEIBULL_input_df_cfp=\n%s' % df_cfp)
        df_cfp['survival_probablity'] = np.where(df_cfp['age'] == df_cfp['age'].min(), 1, 1 - df_cfp['cumulative_probability'])
        df_cfp_survival = df_cfp.reset_index()
        df_cfp_survival=df_cfp_survival[df_cfp_survival.survival_probablity>=0]
        print('df_cfp_survival=\n%s' % df_cfp_survival)
        
        age_array = df_cfp_survival['age'].tolist()
        SP_array = df_cfp_survival['survival_probablity'].tolist() # SP for survivial probability
        print('age_array=%s' % age_array)
        print('SP_array=%s' % SP_array)
        rowcount = len(age_array)

        # step2. get initial alpha (scale) and beta (shape)
        '''
        # Step 2.1 use method in paper [12] - Incorporating aging failures in power system reliability evaluation
        # initial beta
        initial_beta = 0.0
        beta_criteia = sys.float_info.max
        for beta in np.arange(0.1, 100, 0.001):
            beta_estimation = ((1+2/beta)**(0.5+2/beta) * math.exp(-(1+2/beta)) * (1 + 1/12/(1+2/beta))) \
                                      / ((1+1/beta)**(1+2/beta) * math.exp(-(2+2/beta)) * (1+1/12/(1+1/beta))**2 * math.sqrt(2*math.pi)) 
            beta_criteia_now = abs(beta_estimation - (1 + stddev_or_shape_NORMAL ** 2 / mean_or_scale_NORMAL ** 2))
            if beta_criteia_now < beta_criteia:
                beta_criteia = beta_criteia_now
                initial_beta = beta
        # initial alpha
        initial_alpha = math.sqrt(stddev_or_shape_NORMAL**2 / ( math.sqrt(2*math.pi)*((1+2/initial_beta)**(0.5+2/initial_beta) \
                                    * math.exp(-(1+2/initial_beta)) * (1 + 1/12/(1+2/initial_beta))) - (2*math.pi*(1+1/initial_beta)**(1+2/initial_beta) \
                                    * math.exp(-(2+2/initial_beta)) * (1+1/12/(1+1/initial_beta))**2) ) )
        
        err_objective_iter = self.obj_func (initial_alpha, initial_beta, age_array, SP_array)
        print('initial_alpha=%s' % initial_alpha)
        print('initial_beta=%s' % initial_beta)
        print("the init performance objective function value is: " + str(err_objective_iter) + " initial_alpha=" +str(initial_alpha) + " initial_beta=" + str(initial_beta))
        '''

        performance_objective = 0 # performance object function
        
        # Step 2.2 grid search alpha, beta to get initial values starting mean_or_scale_NORMAL and stddev_or_shape_NORMAL
        alpha = mean_or_scale_NORMAL # initial value of alpha
        beta = stddev_or_shape_NORMAL # initial value of beta
        print('alpha=%s' % alpha)
        print('beta=%s' % beta)
        # initial value for performance objective function
        performance_objective = self.obj_func (alpha, beta, age_array, SP_array)
        err_objective_iter = 0
        print("the init performance objective function value is: " + str(performance_objective))
 
        for al in range(1, 100, 1):
            for be in range(1, 100, 1):
                err = self.obj_func (al, be, age_array, SP_array)
                if performance_objective > err :
                    alpha = al
                    beta = be
                    performance_objective = err
        print("Tony alpha is: " + str(alpha))
        print("Tony beta is: " + str(beta))
        print("Tony new obj_function value is: " + str(performance_objective) + " alpha=" +str(alpha) + " beta=" + str(beta))
        
        #print("Tony new obj_function (42,2) value is: " + str(self.obj_func (42, 2, age_array, SP_array)) + " when alpha=42, beta=2" )
        #print("Tony new obj_function (43,2) value is: " + str(self.obj_func (43, 2, age_array, SP_array)) + " when alpha=43, beta=2" )
        #print("Tony new obj_function (44,2) value is: " + str(self.obj_func (44, 2, age_array, SP_array)) + " when alpha=44, beta=2" )
        #alpha = 44
        #beta = 3
        #performance_objective = self.obj_func (alpha, beta, age_array, SP_array)
        
        # step 3. use gradient decent method to search optimal alpha and beta
        # initialization
        
        max_iteration = 10000
        eps = 0.001 # step length
        precision = 0.003 # stop criteria, it is not set too small to avoid overfitting
        gradient_alpha = 0
        gradient_beta = 0
        err_objective_iter = 0
        gradient_alpha_iter = 0
        gradient_beta_iter = 0
        
       
        # main part of gradient decent
        iteration = 0
        while abs(performance_objective) > precision:   
            if iteration % 100 == 0:
                print('in the while loop iteration=%s' % str(iteration))
            for i in range(0,rowcount):
                #print ("year " + str(age_array[i]))
                grad_alpha = 2 * (math.log(SP_array[i]) + math.pow((age_array[i] / alpha), beta)) * beta * math.pow((age_array[i] / alpha), beta-1) * (-age_array[i] / alpha / alpha)
                gradient_alpha_iter = gradient_alpha_iter + grad_alpha
                                
                grad_beta =  2 * (math.log(SP_array[i]) + math.pow((age_array[i] / alpha), beta)) * math.pow((age_array[i] / alpha), beta) * math.log(age_array[i] / alpha)    
                gradient_beta_iter = gradient_beta_iter + grad_beta
                
            gradient_alpha = gradient_alpha_iter
            gradient_alpha_iter = 0
            gradient_beta = gradient_beta_iter
            gradient_beta_iter = 0
            
            performance_objective_iter = 0

            al = alpha - eps * gradient_alpha # gradient decent
            bt = beta - eps * gradient_beta # gradient decent
            err_objective_iter = self.obj_func (al, bt, age_array, SP_array)
            
            if performance_objective <= err_objective_iter : # no improvement, stop loop
                break
            else:
                alpha = al 
                beta = bt     
                performance_objective = err_objective_iter
                
            if iteration % 100 == 0:
                print("the new performance objective function value is: " + str(performance_objective) + " alpha=" +str(alpha) + " beta=" + str(beta))

            iteration = iteration + 1

            if iteration == max_iteration :
                break
        print("the optimal alpha is " + str(alpha))
        print("the optimal beta is " + str(beta))
        print("the optimal performance objective function value is: " + str(performance_objective))

        mean_or_scale_WEIBULL = alpha
        stddev_or_shape_WEIBULL = beta
        print('mean_or_scale_WEIBULL=%s' % mean_or_scale_WEIBULL)
        print('stddev_or_shape_WEIBULL=%s' % stddev_or_shape_WEIBULL)
        return mean_or_scale_WEIBULL, stddev_or_shape_WEIBULL

    def generate_final_curve(self, distribution_type, mean_or_scale_final, stddev_or_shape_final):
        print('generate final degradation curve')
        #final_degradation_curve = []
        final_degradation_curve = dict()
        
        if distribution_type == 'WEIBULL':
            for age in range(0, 101, 1):
                failure_probablity_for_age = (1- math.exp(-((age / mean_or_scale_final) ** stddev_or_shape_final))) * 100  #cumulative density function of WEIBULL
                #final_degradation_curve.append([age, failure_probablity_for_age])
                final_degradation_curve[age] = failure_probablity_for_age
                
        elif distribution_type == 'NORMAL':
            for age in range(0, 101, 1):
                failure_probablity_for_age = norm(mean_or_scale_final, stddev_or_shape_final).cdf(age)  #cumulative density function of NORMAL
                #final_degradation_curve.append([(]age, failure_probablity_for_age])
                final_degradation_curve[age] = failure_probablity_for_age
        else:
            raise('distribution type is invalid')
        print('final_degradation_curve=\n%s' % final_degradation_curve)    
        # here return the list curve, to consider both old and new pipeline mode
        #return str(final_degradation_curve).strip('[]')
        return final_degradation_curve
    
    


    
    def sample_data(self):
        data =[
        ('asset01',1979,), ('asset02',1979,), ('asset03',1979,), ('asset04',1981,), ('asset05',1981,),
        ('asset06',1981,), ('asset07',1985,), ('asset08',1985,), ('asset09',1985,), ('asset10',1979,),
        ('asset11',1979,), ('asset12',1979,), ('asset13',1969,), ('asset14',1969,), ('asset15',1969,),
        ('asset16',1969,), ('asset17',1969,), ('asset18',1969,), ('asset19',1970,1996), ('asset20',1996,),
        ('asset21',1970,), ('asset22',1970,), ('asset23',1970,1989), ('asset24',1989,), ('asset25',1970,),
        ('asset26',1970,), ('asset27',1970,), ('asset28',1970,), ('asset29',1970,), ('asset30',1970,),
        ('asset31',1970,), ('asset32',1970,), ('asset33',1976,), ('asset34',1976,), ('asset35',1976,),
        ('asset36',1976,), ('asset37',1976,), ('asset38',1976,), ('asset39',1976,), ('asset40',1976,),
        ('asset41',1976,), ('asset42',1976,), ('asset43',1976,), ('asset44',1976,), ('asset45',1976,),
        ('asset46',1976,), ('asset47',1976,), ('asset48',1976,), ('asset49',1976,), ('asset50',1976,),
        ('asset51',1976,), ('asset52',1976,), ('asset53',1976,), ('asset54',1970,), ('asset55',1970,),
        ('asset56',1970,), ('asset57',1981,), ('asset58',1981,), ('asset59',1981,), ('asset60',1983,),
        ('asset61',1983,), ('asset62',1983,), ('asset63',1984,), ('asset64',1984,), ('asset65',1984,),
        ('asset66',1983,), ('asset67',1983,), ('asset68',1983,), ('asset69',1984,), ('asset70',1984,),
        ('asset71',1984,), ('asset72',1983,), ('asset73',1983,), ('asset74',1983,), ('asset75',1984,),
        ('asset76',1984,), ('asset77',1984,), ('asset78',1978,), ('asset79',1978,), ('asset80',1978,),
        ('asset81',1969,1996), ('asset82',1996,), ('asset83',1969,), ('asset84',1969,), ('asset85',1969,),
        ('asset86',1969,), ('asset87',1969,), ('asset88',1969,), ('asset89',1969,), ('asset90',1969,),
        ('asset91',1969,), ('asset92',1969,), ('asset93',1969,), ('asset94',1969,), ('asset95',1969,1997),
        ('asset96',1997,), ('asset97',1969,), ('asset98',1969,), ('asset99',1969,), ('asset100',1969,)
        ]
        df_sample = pd.DataFrame(data, columns=['assetId', self.installdate, self.decommissiondate])
        df_sample = df_sample.replace(np.nan, -1)
        #self.logger.debug('df_sample=\n%s' % df_sample)
        return df_sample

