In [1]:
# Directory the pickled per-fold XGBoost models are read from and the
# '<Model>PartialDependency.csv' results are written to.
# The trailing slash is required: paths are built with plain '%s%s' formatting.
ModelsDir = '/home/kate/Research/Property/Models/'
# Directory the training CSV files are read from (trailing slash required, as above).
DataDir = '/home/kate/Research/Property/Data/'
In [2]:
# from https://xiaoxiaowang87.github.io/monotonicity_constraint/
def partial_dependency(model, X, feature,
                       continuous_features=('water_risk_fre_3_blk', 'water_risk_3_blk', 'ecy'),
                       grid_size=50):
    """
    Calculate the partial dependency of the model prediction on a single predictor.

    1. Build a grid of values for the predictor: an evenly spaced grid between the
       0.1th and 99.5th percentile for continuous predictors, or every unique value
       for categorical / discrete predictors.
    2. For each grid value, overwrite the whole predictor column with that value,
       score every row, and average the predictions.

    Parameters
    ----------
    model : trained XGBoost booster
        Must expose ``predict(DMatrix, ntree_limit=...)`` and ``best_ntree_limit``.
    X : pandas.DataFrame
        Feature frame; it is copied, so the caller's frame is never modified.
    feature : str
        Column of ``X`` to vary.
    continuous_features : collection of str, optional
        Names treated as continuous (percentile grid). Any other feature is treated
        as categorical. Defaults to the list originally hard-coded here.
    grid_size : int, optional
        Number of grid points for continuous features (default 50).

    Returns
    -------
    grid : numpy.ndarray
        The feature values that were evaluated.
    y_pred : numpy.ndarray
        Average model prediction for each grid value (same length as ``grid``).
    """

    X_temp = X.copy()

    if feature in continuous_features:
        # continuous
        # nanpercentile ignores missing values; plain np.percentile returns NaN as
        # soon as the column contains a single NaN, which made the whole grid NaN.
        grid = np.linspace(np.nanpercentile(X_temp[feature], 0.1),
                           np.nanpercentile(X_temp[feature], 99.5),
                           grid_size)
    else:
        # categorical
        grid = X_temp[feature].unique()

    y_pred = np.zeros(len(grid))

    for i, val in enumerate(grid):
        # Force every row to the same value of the feature under study.
        X_temp[feature] = val
        # The DMatrix is built from raw values, so columns are matched by position.
        d_temp = xgb.DMatrix(X_temp.values)
        # No base margin / offset is applied to the prediction:
        # d_temp.set_base_margin(offset.values)
        # NOTE(review): best_ntree_limit + 50 presumably mirrors the early-stopping
        # patience used in training - confirm against the training notebook.
        y_pred[i] = np.average(model.predict(d_temp, ntree_limit=model.best_ntree_limit + 50))

    return grid, y_pred
In [3]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import os
In [4]:
# Number of per-fold models to process: the loops below load
# '<Model>_<i>.model' for i in 0 .. kfold-1.
kfold = 5
In [5]:
# Load the training data for the base classification model.
# NOTE: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# (its replacement is on_bad_lines='skip'); it is kept here for the pandas
# version this notebook was written against.
training_dataset = pd.read_csv(
    '%sproperty_wcf_class_training_basemodel0.csv' % DataDir,
    index_col=False,
    error_bad_lines=False,
)
In [6]:
# Columns selected from training_dataset to build the model input frame X.
# The order matters: partial_dependency passes raw values (no column names) to XGBoost.
featureset = [
    'roofcd_encd',
    'sqft',
    'usagetype_encd',
    'yearbuilt',
    'water_risk_3_blk',
    'landlordind',
    'poolind',
    'multipolicyind',
]
In [7]:
# Features for which a partial-dependency curve is computed for every fold model.
pd_featureset = ['poolind']
In [8]:
# Keep only the model input columns, in featureset order; this is the frame
# partial_dependency copies and perturbs.
X = training_dataset.loc[:,featureset]
In [9]:
# Partial dependency of every feature in pd_featureset, for each fold model of
# the base classification model. Results are accumulated in all_fm_pd and
# checkpointed to '<ModelsDir><Model>PartialDependency.csv'.
Model='basemodel0_class_XGB'
# Per-(fold, feature) frames are collected in a list and concatenated, instead of
# growing a DataFrame with DataFrame.append (deprecated, removed in pandas 2.0).
fm_pd_parts = []
all_fm_pd = pd.DataFrame()
for i in range(0,kfold):
    ModelName='%s_%s'%(Model,i)
    print('Processing Model: %s'%ModelName)
    xgb_model_file='%s%s.model'%(ModelsDir,ModelName)
    # Close the file handle deterministically (the bare open() leaked it).
    # NOTE: pickle.load can execute arbitrary code; only load model files you trust.
    with open(xgb_model_file, 'rb') as model_fh:
        xgb_model = pickle.load(model_fh)
    for f in pd_featureset:
        print('Processing:%s'%f)
        grid, y_pred = partial_dependency(xgb_model,X,f)
        fm_pd=pd.concat([pd.Series(grid), pd.Series(y_pred)], axis=1)
        fm_pd.columns=['value','pd']
        fm_pd['feature']=f
        fm_pd['fold']=i
        fm_pd['ModelName']=Model
        fm_pd_parts.append(fm_pd)
        # Rebuild and rewrite after every feature so that partial results survive
        # a failure on a later fold (e.g. a missing model file).
        all_fm_pd=pd.concat(fm_pd_parts)
        all_fm_pd.to_csv('%s%sPartialDependency.csv'%(ModelsDir,Model),header=True,index=False)
Processing Model: basemodel0_class_XGB_0
Processing:poolind
Processing Model: basemodel0_class_XGB_1
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-9-1fe1be02e06b> in <module>()
      5     print('Processing Model: %s'%ModelName)
      6     xgb_model_file='%s%s.model'%(ModelsDir,ModelName)
----> 7     xgb_model = pickle.load(open(xgb_model_file, 'rb'))
      8     for f in pd_featureset:
      9         print('Processing:%s'%f)

FileNotFoundError: [Errno 2] No such file or directory: '/home/kate/Research/Property/Models/basemodel0_class_XGB_1.model'
In [10]:
# Show the partial-dependency rows accumulated so far
# (one row per grid value, per feature, per fold).
all_fm_pd
Out[10]:
value pd feature fold ModelName
0 0 0.011368 poolind 0 basemodel0_class_XGB
1 1 0.009884 poolind 0 basemodel0_class_XGB
In [ ]:
# Load the training data for the second model; this rebinds training_dataset.
# NOTE: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# (its replacement is on_bad_lines='skip'); it is kept here for the pandas
# version this notebook was written against.
training_dataset = pd.read_csv(
    '%sproperty_wcf_training.csv' % DataDir,
    index_col=False,
    error_bad_lines=False,
)
In [ ]:
# Rebuild X from the newly loaded training data, using the same featureset as above.
# NOTE(review): this assumes the model scored below was trained on exactly these
# columns in this order - confirm before trusting the results.
X = training_dataset.loc[:,featureset]
In [ ]:
# Partial dependency of every feature in pd_featureset, for each fold model of
# 'wc_Poisson_f_ecy_XGB'. Results are accumulated in all_fm_pd and checkpointed
# to '<ModelsDir><Model>PartialDependency.csv'.
Model='wc_Poisson_f_ecy_XGB'
# Per-(fold, feature) frames are collected in a list and concatenated, instead of
# growing a DataFrame with DataFrame.append (deprecated, removed in pandas 2.0).
fm_pd_parts = []
all_fm_pd = pd.DataFrame()
for i in range(0,kfold):
    ModelName='%s_%s'%(Model,i)
    print('Processing Model: %s'%ModelName)
    xgb_model_file='%s%s.model'%(ModelsDir,ModelName)
    # Close the file handle deterministically (the bare open() leaked it).
    # NOTE: pickle.load can execute arbitrary code; only load model files you trust.
    with open(xgb_model_file, 'rb') as model_fh:
        xgb_model = pickle.load(model_fh)
    for f in pd_featureset:
        print('Processing:%s'%f)
        grid, y_pred = partial_dependency(xgb_model,X,f)
        fm_pd=pd.concat([pd.Series(grid), pd.Series(y_pred)], axis=1)
        fm_pd.columns=['value','pd']
        fm_pd['feature']=f
        fm_pd['fold']=i
        fm_pd['ModelName']=Model
        fm_pd_parts.append(fm_pd)
        # Rebuild and rewrite after every feature so that partial results survive
        # a failure on a later fold (e.g. a missing model file).
        all_fm_pd=pd.concat(fm_pd_parts)
        all_fm_pd.to_csv('%s%sPartialDependency.csv'%(ModelsDir,Model),header=True,index=False)