import os
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb

ModelsDir = '/home/kate/Research/Property/Models/'
DataDir = '/home/kate/Research/Property/Data/'
# from https://xiaoxiaowang87.github.io/monotonicity_constraint/
def partial_dependency(model, X, offset, feature):
    """
    Calculate the partial dependency of the response on a predictor.
    1. Build a grid of values for the predictor: an evenly spaced sample between the
       0.1th and 99.5th percentiles for continuous features, or all unique values
       for categorical/discrete features.
    2. For each grid value, set every row of the predictor to that value and record
       the average model prediction.
    """
    X_temp = X.copy()
    if feature in ['water_risk_fre_3_blk', 'water_risk_3_blk', 'ecy']:
        # Continuous feature: 50 evenly spaced points across the bulk of the distribution.
        grid = np.linspace(np.percentile(X_temp[feature], 0.1),
                           np.percentile(X_temp[feature], 99.5),
                           50)
    else:
        # Categorical or discrete feature: every observed value.
        grid = X_temp[feature].unique()
    y_pred = np.zeros(len(grid))
    for i, val in enumerate(grid):
        X_temp[feature] = val
        d_temp = xgb.DMatrix(X_temp.values)
        # The log-exposure offset enters the model as a base margin.
        d_temp.set_base_margin(offset.values)
        # Predict with the early-stopped tree count plus a 50-tree margin.
        y_pred[i] = np.average(model.predict(d_temp, ntree_limit=model.best_ntree_limit + 50))
    return grid, y_pred
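# A minimal, self-contained sketch of why set_base_margin is used above, with toy
# data and a hypothetical helper name (_base_margin_demo is not part of the original
# pipeline). It assumes a Poisson objective, as in the wc_Poisson model below: the
# base margin is added on the log scale, so predictions scale with exposure.
def _base_margin_demo():
    rng = np.random.default_rng(0)
    X_toy = rng.normal(size=(200, 3))
    exposure = rng.uniform(0.5, 2.0, size=200)
    y_toy = rng.poisson(exposure * np.exp(0.3 * X_toy[:, 0]))
    d_train = xgb.DMatrix(X_toy, label=y_toy)
    d_train.set_base_margin(np.log(exposure))
    booster = xgb.train({'objective': 'count:poisson'}, d_train, num_boost_round=10)
    d1 = xgb.DMatrix(X_toy)
    d1.set_base_margin(np.log(exposure))
    d2 = xgb.DMatrix(X_toy)
    d2.set_base_margin(np.log(2 * exposure))
    # Doubling the exposure shifts the margin by log(2) and doubles the prediction.
    assert np.allclose(booster.predict(d2), 2 * booster.predict(d1), rtol=1e-4)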
kfold = 5
training_dataset = pd.read_csv('%sproperty_wcf_class_training.csv' % DataDir, on_bad_lines='skip', index_col=False)
featureset = [
'cova_deductible',
'roofcd_encd',
'sqft',
'usagetype_encd',
'yearbuilt',
'cova_limit',
'water_risk_fre_3_blk',
'water_risk_3_blk'
]
# Features to compute partial dependency for (currently the full feature set).
pd_featureset = list(featureset)
X = training_dataset.loc[:, featureset]
offset = training_dataset['log_ecy']  # log exposure, used as the base margin
Model = 'wc_class_bm_ecy_XGB'
all_fm_pd = []
for i in range(kfold):
    ModelName = '%s_%s' % (Model, i)
    print('Processing Model: %s' % ModelName)
    xgb_model_file = '%s%s.model' % (ModelsDir, ModelName)
    xgb_model = pickle.load(open(xgb_model_file, 'rb'))
    for f in pd_featureset:
        print('Processing: %s' % f)
        grid, y_pred = partial_dependency(xgb_model, X, offset, f)
        fm_pd = pd.concat([pd.Series(grid), pd.Series(y_pred)], axis=1)
        fm_pd.columns = ['value', 'pd']
        fm_pd['feature'] = f
        fm_pd['fold'] = i
        fm_pd['ModelName'] = Model
        all_fm_pd.append(fm_pd)
all_fm_pd = pd.concat(all_fm_pd, ignore_index=True)
all_fm_pd.to_csv('%s%sPartialDependency.csv' % (ModelsDir, Model), header=True, index=False)
# Repeat for the Poisson model.
training_dataset = pd.read_csv('%sproperty_wcf_training.csv' % DataDir, on_bad_lines='skip', index_col=False)
X = training_dataset.loc[:, featureset]
offset = training_dataset['log_ecy']  # log exposure, used as the base margin
Model = 'wc_Poisson_bm_ecy_XGB'
all_fm_pd = []
for i in range(kfold):
    ModelName = '%s_%s' % (Model, i)
    print('Processing Model: %s' % ModelName)
    xgb_model_file = '%s%s.model' % (ModelsDir, ModelName)
    xgb_model = pickle.load(open(xgb_model_file, 'rb'))
    for f in pd_featureset:
        print('Processing: %s' % f)
        grid, y_pred = partial_dependency(xgb_model, X, offset, f)
        fm_pd = pd.concat([pd.Series(grid), pd.Series(y_pred)], axis=1)
        fm_pd.columns = ['value', 'pd']
        fm_pd['feature'] = f
        fm_pd['fold'] = i
        fm_pd['ModelName'] = Model
        all_fm_pd.append(fm_pd)
all_fm_pd = pd.concat(all_fm_pd, ignore_index=True)
all_fm_pd.to_csv('%s%sPartialDependency.csv' % (ModelsDir, Model), header=True, index=False)
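# A minimal plotting sketch for the CSVs written above; matplotlib and the output
# filenames here are illustrative additions, not part of the original pipeline.
# Each feature gets one panel with a partial-dependency curve per fold.
import matplotlib.pyplot as plt

pd_curves = pd.read_csv('%s%sPartialDependency.csv' % (ModelsDir, Model))
for f, feature_df in pd_curves.groupby('feature'):
    fig, ax = plt.subplots()
    for fold, fold_df in feature_df.groupby('fold'):
        ax.plot(fold_df['value'], fold_df['pd'], label='fold %s' % fold)
    ax.set_xlabel(f)
    ax.set_ylabel('average prediction')
    ax.set_title('%s partial dependency: %s' % (Model, f))
    ax.legend()
    fig.savefig('%s%s_%s_pd.png' % (ModelsDir, Model, f))
    plt.close(fig)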