In [1]:
# Directory the pickled '<ModelName>.model' files are read from; the generated
# .fmap files, xgbfir .xlsx workbooks and the summary CSV are written here too.
# Paths are built by plain string concatenation, so the trailing slash is required.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
ModelsDir = '/home/kate/Research/Property/Models/'
In [2]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import os
import xgbfir
In [3]:
# Base names of the models to analyse. Each name is combined with a fold index
# to locate '<name>_<fold>.model' in ModelsDir. Uncomment entries to include them.
# Every entry ends with a comma: without it, uncommenting the following line
# would silently concatenate two adjacent string literals into one bogus name.
Models = [
    'basemodel0_class_XGB',
    #'basemodel1_class_XGB',
    #'wc_Poisson_bm_ecy_XGB',
    #'wc_Poisson_f_ecy_XGB',
    #'wc_class_f_ecy_XGB',
    #'wc_class_bm_ecy_XGB',
    #'wc_Linear_Reg_XGB_mae',
    #'wc_LogRegObj_Reg_XGB_mae',
    #'wc_Gamma_Reg_XGB_mae',
]
In [4]:
# Feature columns for models whose name contains 'basemodel0'.
# Order is significant: position i is written as feature id i in the fmap file.
featureset_basemodel0 = [
    'roofcd_encd',
    'sqft',
    'usagetype_encd',
    'yearbuilt',
    'water_risk_3_blk',
    'landlordind',
    'multipolicyind',
]
In [5]:
# Feature columns for models whose name contains 'basemodel1'.
# Order is significant: position i is written as feature id i in the fmap file.
featureset_basemodel1 = [
    'roofcd_encd',
    'sqft',
    'usagetype_encd',
    'yearbuilt',
    'water_risk_3_blk',
    'landlordind',
    'multipolicyind',
    'cova_deductible',
    'cova_limit',
]
In [6]:
# Feature columns for models whose name contains '_bm_' (checked first when a
# feature set is chosen). Currently the same columns as featureset_basemodel1.
# Order is significant: position i is written as feature id i in the fmap file.
featureset_bm = [
    'roofcd_encd',
    'sqft',
    'usagetype_encd',
    'yearbuilt',
    'water_risk_3_blk',
    'landlordind',
    'multipolicyind',
    'cova_deductible',
    'cova_limit',
]
In [7]:
# Fallback feature columns: used for any model name that matches none of the
# '_bm_', 'basemodel0', 'basemodel1' or '_Reg_' patterns.
# Order is significant: position i is written as feature id i in the fmap file.
featureset_f = [
    'roofcd_encd',
    'sqft',
    'usagetype_encd',
    'yearbuilt',
    'water_risk_3_blk',
    'landlordind',
    'multipolicyind',
    'cova_deductible',
    'cova_limit',
    'ecy',
]
In [8]:
# Feature columns for models whose name contains '_Reg_'.
# Order is significant: position i is written as feature id i in the fmap file.
featureset_s = [
    'cova_deductible',
    'roofcd_encd',
    'water_risk_sev_3_blk',
    'sqft',
    'rep_cost_3_blk',
    'yearbuilt',
    'ecy',
    'usagetype_encd',
]
In [9]:
# Number of folds processed per model: the loop iterates fold in range(0, kfold)
# and loads '<Model>_<fold>.model' for each one.
kfold = 1
In [10]:
def create_fmap(ModelName,featureset):
    """Write an XGBoost feature-map file for a model and return its path.

    The file is created at '<ModelsDir><ModelName>.fmap' (overwriting any
    existing file) with one tab-separated line per feature:
    '<index>\\t<name>\\tq', where the index is the feature's position in
    ``featureset`` and 'q' is the feature-type flag (quantitative in
    XGBoost's feature-map format).

    Parameters
    ----------
    ModelName : str
        Model name used as the file stem.
    featureset : iterable of str
        Feature names in the order the model saw them.

    Returns
    -------
    str
        Path of the written .fmap file.
    """
    fmap_filename = '%s%s.fmap' % (ModelsDir, ModelName)
    # 'with' guarantees the handle is closed even if a write fails part-way.
    with open(fmap_filename, 'w') as outfile:
        for i, feat in enumerate(featureset):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    return fmap_filename
In [11]:
# For every model and fold: load the pickled booster, collect its weight / gain /
# cover feature importances, and export an xgbfir interaction workbook. The
# combined importance table is written to '<ModelsDir>wc_feature_importance.csv'.
output_columns = ['ModelName','fold','FeatureName','Weight','Gain','Cover']
# One frame per (model, fold), concatenated once after the loop. DataFrame.append
# was removed in pandas 2.0 and re-copied the accumulated table on every call.
feat_imp_frames = []
for Model in Models:
    for i in range(0,kfold):
        ModelName=Model+"_%s"%i
        xgb_model_file='%s%s.model'%(ModelsDir,ModelName)
        print('Processing model %s, fold %s...'%(Model,i))
        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only load model files from a trusted source.
        with open(xgb_model_file, 'rb') as model_file:
            xgb_model = pickle.load(model_file)
        #feature importance
        # The feature set is chosen from the model name; the order of these
        # checks matters because '_bm_' takes precedence over the others.
        if '_bm_'  in Model:
            featureset=featureset_bm
        elif 'basemodel0'  in Model:
            featureset=featureset_basemodel0
        elif 'basemodel1'  in Model:
            featureset=featureset_basemodel1
        elif '_Reg_'  in Model:
            featureset=featureset_s
        else:
            featureset=featureset_f
        fmap_filename=create_fmap(ModelName,featureset)
        # 'weight' defines the rows; 'gain' and 'cover' are left-joined onto it
        # by feature name.
        feat_imp = pd.Series(xgb_model.get_score(fmap=fmap_filename,importance_type='weight')).to_frame('Weight')
        for column, importance_type in (('Gain','gain'), ('Cover','cover')):
            feat_imp = feat_imp.join(pd.Series(xgb_model.get_score(fmap=fmap_filename,importance_type=importance_type)).rename(column))
        feat_imp['fold']=i
        feat_imp['FeatureName'] = feat_imp.index
        feat_imp['ModelName'] = Model
        feat_imp_frames.append(feat_imp)
        #iterations
        xgbfir.saveXgbFI(xgb_model, feature_names=featureset,  TopK = 500,  MaxTrees = 500, MaxInteractionDepth = 2, OutputXlsxFile = '%s%s_%s.xlsx'%(ModelsDir,Model,i))
if feat_imp_frames:
    feat_imp_all = pd.concat(feat_imp_frames, ignore_index=True)[output_columns]
else:
    # No model/fold processed: still produce a well-formed, header-only table.
    feat_imp_all = pd.DataFrame(columns=output_columns)
feat_imp_all.to_csv('%swc_feature_importance.csv'%ModelsDir,header=True,index=False)
Processing model basemodel0_class_XGB, fold 0...