# Output directory for every artifact this script writes (pickled model,
# feature map, partial-dependence CSV). The trailing slash matters: most
# paths below are built by plain string concatenation.
ModelsDir = '/home/kate/Research/Property/Models/'
# Base file name shared by all artifacts of this run.
ModelName = 'regression_tmp'
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import math
from sklearn.metrics import mean_absolute_error
def evalerror(preds, dtrain):
    """Custom evaluation callback returning mean absolute error.

    Parameters
    ----------
    preds : predictions to score.
    dtrain : object exposing ``get_label()``; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('mae', value)`` -- the metric name and the MAE between the
        predictions and the labels.
    """
    return 'mae', mean_absolute_error(preds, dtrain.get_label())
# Load the modelling dataset. Malformed rows are skipped with a warning
# instead of aborting the load.
data_path = '/home/kate/Research/Property/Data/EDA_log_Severity_FI_dataset.csv'
try:
    # pandas >= 1.3 spelling. 'warn' is what a lone `error_bad_lines=False`
    # was mapped to, so bad rows are still dropped and still reported.
    data = pd.read_csv(data_path, on_bad_lines='warn', index_col=False)
except TypeError:
    # Older pandas does not know `on_bad_lines`; fall back to the original
    # keyword (removed in pandas 2.0).
    data = pd.read_csv(data_path, error_bad_lines=False, index_col=False)

# Model input columns. ORDER MATTERS: the training matrix is built from
# X.values (no column names), and the feature-map file written later maps
# each positional index in this list back to its name.
featureset = [
    'stories',
    'units',
    'multipolicyind',
    'functionalreplacementcost',
    'landlordind',
    'burglaryalarmtype',
    'propertymanager',
    'gatedcommunityind',
    'replacementcostdwellingind',
    'equipmentbreakdown',
    'cova_deductible',
    'water_risk_sev_3_blk',
    'fixture_leak_3_blk',
    'rep_cost_3_blk',
    'sqft',
    'waterded',
    'constructioncd_encd',
    'multipolicyindumbrella',
    'usagetype_encd',
    'homegardcreditind',
    'rentersinsurance',
    'waterdetectiondevice',
    'safeguardplusind',
    'deadboltind',
    'replacementvalueind',
    'numberoffamilies',
    'water_risk_fre_3_blk',
    'pipe_froze_3_blk',
    'ustructure_fail_3_blk',
    'customer_cnt_active_policies_binned',
    'ecy',
    'yearbuilt',
    'roofcd_encd',
    'occupancy_encd',
    'protectionclass',
    'fire_risk_model_score',
    'earthquakeumbrellaind',
    'ordinanceorlawpct',
    'sprinklersystem',
    'firealarmtype',
    'neighborhoodcrimewatchind',
    'kitchenfireextinguisherind',
    'poolind',
    'serviceline',
    'cova_limit',
    'water_risk_3_blk',
    'appl_fail_3_blk',
    'plumb_leak_3_blk',
    'waterh_fail_3_blk'
]

# Regression target and the column that will receive in-sample predictions.
target_column = 'log_cova_il_nc_water'
prediction_column = 'pred'

X = data[featureset]
y = data[target_column]
# Training matrix built from the raw numpy values, so the booster only sees
# positional features (no column names are attached here).
Dtrain = xgb.DMatrix(X.values, label=y)

# Number of boosting rounds.
nrounds = 600
# NOTE(review): `esr` is assigned here but not referenced anywhere else in
# the visible code -- presumably a leftover early-stopping setting.
esr = 100

# Booster hyper-parameters.
xgb_params = dict(
    seed=42,
    eta=0.01,
    colsample_bytree=0.9,
    silent=1,
    subsample=0.9,
    objective='reg:linear',
    eval_metric='mae',
    max_depth=6,
    gamma=0.4,
    min_child_weight=4,
)

# NOTE(review): no `evals` watch-list is passed, so the custom `evalerror`
# metric presumably never gets evaluated during training -- confirm.
xgb_model = xgb.train(xgb_params, Dtrain, num_boost_round=nrounds, feval=evalerror)
# Persist the trained booster. The context manager guarantees the file handle
# is flushed and closed even if pickling fails.
xgb_model_file = '%s%s.model' % (ModelsDir, ModelName)
with open(xgb_model_file, 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

# In-sample predictions. The requested limit (best + 50) is capped at the
# number of rounds actually trained: without early stopping the uncapped value
# asks for more trees than exist.
# NOTE(review): older xgboost silently clamps an oversized ntree_limit; newer
# 1.x releases appear to reject it -- confirm against the deployed version.
data[prediction_column] = xgb_model.predict(
    Dtrain,
    ntree_limit=min(xgb_model.best_ntree_limit + 50, nrounds),
)

# Feature-map file: one "<index>\t<name>\tq" line per feature so the
# positional features can be reported by name ('q' = quantitative).
# ModelsDir already ends with '/', so this path contains a doubled slash.
fmap_filename = '%s/%s.fmap' % (ModelsDir, ModelName)
with open(fmap_filename, 'w') as outfile:
    for i, feat in enumerate(featureset):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
# Feature importance: one row per feature reported by the booster, with one
# column per importance type (names resolved through the feature-map file).
feat_imp = None
for column_name, importance_type in (('Weight', 'weight'),
                                     ('Gain', 'gain'),
                                     ('Cover', 'cover')):
    scores = pd.Series(
        xgb_model.get_score(fmap=fmap_filename, importance_type=importance_type)
    ).to_frame(column_name)
    # The first importance type seeds the table; the others are joined on the
    # feature-name index.
    feat_imp = scores if feat_imp is None else feat_imp.join(scores)

# Disabled per-fold bookkeeping kept from the original:
#feat_imp['fold']=i
feat_imp['FeatureName'] = feat_imp.index
feat_imp['ModelName'] = ModelName
#feat_imp_all = feat_imp_all.append(feat_imp, ignore_index=True)

# Bare expression: the sorted view is not assigned, so `feat_imp` itself stays
# unsorted (in a notebook this line just displays the table ordered by gain).
feat_imp.sort_values('Gain', ascending=False)
# from https://xiaoxiaowang87.github.io/monotonicity_constraint/
def partial_dependency(model, X, feature, continuous_features=None):
    """
    Calculate the partial dependency of the model prediction on one predictor.

    1. Build a grid of values for the predictor: 50 evenly spaced points
       between its 0.1th and 99.5th percentiles when it is continuous,
       otherwise every unique value found in the column.
    2. For each grid value, overwrite that predictor in every row with the
       value and record the average prediction.

    Parameters
    ----------
    model : trained booster; ``predict``, ``best_ntree_limit`` and
        ``get_dump`` are used.
    X : feature table (assumed to be a pandas DataFrame whose column order
        matches the matrix the model was trained on -- TODO confirm at callers).
    feature : name of the column in ``X`` to vary.
    continuous_features : optional collection of column names to treat as
        continuous. Defaults to the six columns this function has always
        treated that way.

    Returns
    -------
    (grid, y_pred) : the grid values and, aligned with them, the average
        prediction obtained for each value.
    """
    if continuous_features is None:
        continuous_features = ('sqft', 'yearbuilt', 'water_risk_sev_3_blk',
                               'water_risk_3_blk', 'water_risk_fre_3_blk', 'ecy')

    # Work on a copy so the caller's frame is never modified.
    X_temp = X.copy()

    if feature in continuous_features:
        # continuous
        grid = np.linspace(np.percentile(X_temp[feature], 0.1),
                           np.percentile(X_temp[feature], 99.5),
                           50)
    else:
        # categorical
        grid = X_temp[feature].unique()

    # Loop-invariant tree limit: best + 50 as before, but never more trees
    # than the model holds (get_dump() returns one entry per tree).
    # NOTE(review): older xgboost silently clamps an oversized ntree_limit;
    # newer 1.x releases appear to reject it -- confirm for the deployed version.
    ntree_limit = min(model.best_ntree_limit + 50, len(model.get_dump()))

    y_pred = np.zeros(len(grid))
    for i, val in enumerate(grid):
        X_temp[feature] = val
        d_temp = xgb.DMatrix(X_temp.values)
        y_pred[i] = np.average(model.predict(d_temp, ntree_limit=ntree_limit))
    return grid, y_pred
# Features for which partial dependence is computed: 47 of the 49 `featureset`
# columns ('functionalreplacementcost' and 'waterdetectiondevice' are absent).
pd_features = [
    'waterded',
    'cova_deductible',
    'earthquakeumbrellaind',
    'deadboltind',
    'fire_risk_model_score',
    'neighborhoodcrimewatchind',
    'occupancy_encd',
    'usagetype_encd',
    'poolind',
    'pipe_froze_3_blk',
    'roofcd_encd',
    'water_risk_3_blk',
    'cova_limit',
    'water_risk_sev_3_blk',
    'safeguardplusind',
    'units',
    'homegardcreditind',
    'water_risk_fre_3_blk',
    'rentersinsurance',
    'waterh_fail_3_blk',
    'serviceline',
    'ordinanceorlawpct',
    'sqft',
    'rep_cost_3_blk',
    'yearbuilt',
    'equipmentbreakdown',
    'ecy',
    'protectionclass',
    'plumb_leak_3_blk',
    'appl_fail_3_blk',
    'customer_cnt_active_policies_binned',
    'landlordind',
    'sprinklersystem',
    'ustructure_fail_3_blk',
    'multipolicyind',
    'constructioncd_encd',
    'multipolicyindumbrella',
    'fixture_leak_3_blk',
    'replacementvalueind',
    'replacementcostdwellingind',
    'propertymanager',
    'numberoffamilies',
    'stories',
    'gatedcommunityind',
    'burglaryalarmtype',
    'firealarmtype',
    'kitchenfireextinguisherind'
]

# Collect one (value, pd, feature) frame per feature and concatenate once at
# the end: DataFrame.append re-copied the growing table on every iteration
# and no longer exists in pandas 2.x.
pd_frames = []
for f in pd_features:
    print('Processing:%s'%f)
    grid, y_pred = partial_dependency(xgb_model,X,f)
    fm_pd=pd.concat([pd.Series(grid), pd.Series(y_pred)], axis=1)
    fm_pd.columns=['value','pd']
    fm_pd['feature']=f
    pd_frames.append(fm_pd)

# Each per-feature frame keeps its own 0..n-1 index, exactly as repeated
# appends did; an empty feature list still yields an empty DataFrame.
all_fm_pd = pd.concat(pd_frames) if pd_frames else pd.DataFrame()
all_fm_pd.to_csv('%s%s_PartialDependency.csv'%(ModelsDir,ModelName),header=True,index=False)
%matplotlib inline
for f in pd_features:
all_fm_pd[all_fm_pd['feature']==f].plot(kind='scatter',x='value', y='pd', title=f)
for f in pd_features:
print(all_fm_pd[all_fm_pd['feature']==f])