# Directory prefix for all inputs and outputs of this script. It is joined to
# file names by plain string formatting, so it must end with a path separator.
# Read from here: pickled models named <Model>_<fold>.model.
# Written here: <Model>_<fold>.fmap feature maps, <Model>_<fold>.xlsx xgbfir
# reports and the combined wc_feature_importance.csv.
ModelsDir = '/home/kate/Research/Property/Models/'
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import os
import xgbfir
# Base names of the models to process (the fold index is appended later).
# Commented-out entries are switched off; un-comment a line to include it.
# Every entry, active or not, ends with a comma: without it, two adjacent
# string literals would be silently concatenated into one bogus model name.
Models = [
    'basemodel0_class_XGB',
    # 'basemodel1_class_XGB',
    # 'wc_Poisson_bm_ecy_XGB',
    # 'wc_Poisson_f_ecy_XGB',
    # 'wc_class_f_ecy_XGB',
    # 'wc_class_bm_ecy_XGB',
    # 'wc_Linear_Reg_XGB_mae',
    # 'wc_LogRegObj_Reg_XGB_mae',
    # 'wc_Gamma_Reg_XGB_mae',
]
# Feature lists, one per model family. The script picks a list from the model
# name, testing the substrings '_bm_', 'basemodel0', 'basemodel1' and '_Reg_'
# in that order and falling back to featureset_f.
# Position in a list becomes the feature id written to the .fmap file
# (presumably this must match the column order used at training time -- confirm).

# Core attributes shared by the base-model families.
featureset_basemodel0 = [
    'roofcd_encd',
    'sqft',
    'usagetype_encd',
    'yearbuilt',
    'water_risk_3_blk',
    'landlordind',
    'multipolicyind',
]

# Core attributes plus the two coverage-A columns.
featureset_basemodel1 = featureset_basemodel0 + [
    'cova_deductible',
    'cova_limit',
]

# Same content as featureset_basemodel1, kept as an independent list object.
featureset_bm = list(featureset_basemodel1)

# The '_bm_' list extended with 'ecy'.
featureset_f = featureset_bm + ['ecy']

# Separate list (different columns and order) used for the '_Reg_' models.
featureset_s = [
    'cova_deductible',
    'roofcd_encd',
    'water_risk_sev_3_blk',
    'sqft',
    'rep_cost_3_blk',
    'yearbuilt',
    'ecy',
    'usagetype_encd',
]
# Number of folds processed per model: fold indices 0..kfold-1 are appended to
# the model name to build the model, fmap and xlsx file names.
kfold = 1
def create_fmap(ModelName, featureset, models_dir=None):
    """Write an XGBoost feature-map file for ``featureset`` and return its path.

    One line is written per feature in the form ``<id>\\t<name>\\tq``, where
    ``<id>`` is the feature's position in ``featureset`` and ``q`` is the
    feature-map type code for a quantitative feature.

    Args:
        ModelName: File stem; the map is written to ``<models_dir><ModelName>.fmap``.
        featureset: Iterable of feature names, in feature-id order.
        models_dir: Directory prefix for the output file. It is concatenated
            verbatim, so it must end with a path separator. Defaults to the
            module-level ``ModelsDir``.

    Returns:
        The path of the feature-map file that was written.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    if models_dir is None:
        # Looked up at call time so the module-level setting is honoured.
        models_dir = ModelsDir
    fmap_filename = '%s%s.fmap' % (models_dir, ModelName)
    # The context manager closes the file even if a write fails part-way.
    with open(fmap_filename, 'w') as outfile:
        for i, feat in enumerate(featureset):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    return fmap_filename
# Column order of the combined importance table written to CSV.
importance_columns = ['ModelName', 'fold', 'FeatureName', 'Weight', 'Gain', 'Cover']

# One frame per (model, fold) is collected here and concatenated once at the
# end. DataFrame.append was removed in pandas 2.0 and also re-copied the
# accumulated table on every call.
feat_imp_frames = []

for Model in Models:
    # The feature list depends only on the model name, so it is chosen once
    # per model. The order of the checks matters: '_bm_' wins over the rest.
    if '_bm_' in Model:
        featureset = featureset_bm
    elif 'basemodel0' in Model:
        featureset = featureset_basemodel0
    elif 'basemodel1' in Model:
        featureset = featureset_basemodel1
    elif '_Reg_' in Model:
        featureset = featureset_s
    else:
        featureset = featureset_f

    for i in range(0, kfold):
        ModelName = Model + "_%s" % i
        xgb_model_file = '%s%s.model' % (ModelsDir, ModelName)
        print('Processing model %s, fold %s...' % (Model, i))

        # NOTE(security): unpickling can execute arbitrary code; only load
        # model files that come from a trusted source.
        with open(xgb_model_file, 'rb') as model_file:
            xgb_model = pickle.load(model_file)

        # Feature importance: the fmap file maps feature ids to names so the
        # scores come back keyed by feature name.
        fmap_filename = create_fmap(ModelName, featureset)
        weight_scores = xgb_model.get_score(fmap=fmap_filename, importance_type='weight')
        feat_imp = pd.Series(weight_scores, name='Weight').to_frame()
        # Left-join the other importance types onto the 'weight' index.
        for column, importance_type in (('Gain', 'gain'), ('Cover', 'cover')):
            scores = xgb_model.get_score(fmap=fmap_filename, importance_type=importance_type)
            feat_imp = feat_imp.join(pd.Series(scores, name=column))
        feat_imp['fold'] = i
        feat_imp['FeatureName'] = feat_imp.index
        feat_imp['ModelName'] = Model
        feat_imp_frames.append(feat_imp)

        # Feature interactions: xgbfir writes one workbook per model and fold.
        xgbfir.saveXgbFI(xgb_model, feature_names=featureset, TopK = 500, MaxTrees = 500, MaxInteractionDepth = 2, OutputXlsxFile = '%s%s_%s.xlsx'%(ModelsDir,Model,i))

if feat_imp_frames:
    feat_imp_all = pd.concat(feat_imp_frames, ignore_index=True)[importance_columns]
else:
    # Nothing was processed (no models or no folds): still write a CSV that
    # carries the expected header instead of failing on missing columns.
    feat_imp_all = pd.DataFrame(columns=importance_columns)
feat_imp_all.to_csv('%swc_feature_importance.csv' % ModelsDir, header=True, index=False)