Tests combining the results of the Poisson GLM and negative-binomial (NB) GLM with the dataset's raw features (yearbuilt, sqft, etc.) via a Poisson XGBoost model, where exposure (ecy) is used as a feature in both the GLMs and the XGB. The classification XGB cannot be added in this configuration because it was built on a different training dataset.
# --- Configuration ---------------------------------------------------------
ModelsDir = '/home/kate/Research/Property/Models/'  # directory where per-fold XGB models are pickled
ModelName='wc_Poisson_XGB_All_f_ecy'  # base name for the saved model files (one per fold)
UseSavedIfExists = False  # if True, load a previously pickled fold model instead of retraining
DataDir = '/home/kate/Research/Property/Data/'  # directory holding the training/testing/prediction CSVs
import sys
sys.path.append('/home/kate/code/Utils/')  # make the local MyFunctions utility module importable
from MyFunctions import NormalizedWeightedGini  # exposure-weighted normalized Gini metric
from MyFunctions import nLogLik_XGBoost  # Poisson negative log-likelihood metric
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import os
# Load the three datasets produced by the upstream GLM steps. They already
# contain the per-fold GLM prediction columns referenced below.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0 (replaced by on_bad_lines='skip') — confirm the pinned pandas version.
training_dataset = pd.read_csv('%sproperty_wcf_training.csv'%DataDir, error_bad_lines=False, index_col=False)
testing_dataset = pd.read_csv('%sproperty_wcf_testing.csv'%DataDir, error_bad_lines=False, index_col=False)
prediction_dataset = pd.read_csv('%sproperty_wcf_prediction.csv'%DataDir, error_bad_lines=False, index_col=False)
# Target: water claim count; prediction columns for the CV average and per fold.
target_column = 'cova_ic_nc_water'
prediction_column_cv='cova_ic_nc_water_cv_pred'
prediction_column_fold = 'cova_ic_nc_water_pred'
# Prefixes of per-fold model-output columns usable as stacked features.
feature_column_pglm='poissonglm_f_ecy'  # Poisson GLM prediction (used below)
feature_column_nbglm='nbglm_f_ecy'  # NB GLM prediction (defined but not used in this run)
feature_column_cxgb='class_f_ecy_xgb_fold'  # classification XGB (not usable here — different training dataset)
# Raw property features fed to the XGB model; order fixes the DMatrix column
# order, so it must stay consistent across folds and datasets.
featureset = [
'cova_deductible',
'roofcd_encd',
'sqft',
'usagetype_encd',
'yearbuilt',
'cova_limit',
'water_risk_fre_3_blk',
'water_risk_3_blk',
'ecy'  # exposure (earned car/house years) used directly as a feature
]
nrounds=5000  # max boosting rounds; early stopping (100 rounds) cuts this short
# Poisson-count XGB hyperparameters.
# NOTE(review): 'silent' is deprecated in newer xgboost (use 'verbosity') —
# confirm the pinned xgboost version.
simon_params = {
'objective': 'count:poisson',
'eval_metric': 'poisson-nloglik',
'silent': True,
'booster': 'gbtree',
'eta': 0.01,
'subsample': 0.8,
'colsample_bytree': 0.8,
'min_child_weight': 3,
'max_depth': 6,
'gamma': 0,
'seed': 42}
kfold = 5  # number of CV folds; fold membership comes from fold_<i> columns
# Add the cross-validated prediction column and one per-fold prediction
# column, zero-initialised, to each of the three datasets; the CV loop
# below fills them in.
for _ds in (training_dataset, testing_dataset, prediction_dataset):
    _ds[prediction_column_cv] = 0
    for _fold in range(kfold):
        _ds['%s_%s' % (prediction_column_fold, _fold)] = 0
# Per-fold score accumulators, assembled into the Scores DataFrame at the end.
Train_Gini_l = []
Test_Gini_l = []
Train_LogLik_l = []
Test_LogLik_l = []
#-----------------------------------------------------------------------------------------------------------
# CV-folds modeling: train one Poisson XGB per fold (raw features + that
# fold's Poisson GLM prediction), store each fold's predictions, and average
# the fold models into the *_cv_pred column of every dataset.
for i in range(0, kfold):
    print(' fold: {} of {} : '.format(i+1, kfold))
    # Rows with fold_<i> > 0 are in-fold training data; fold_<i> == 0 is the holdout.
    training_dataset_fold = training_dataset[training_dataset['fold_%s'%i]>0]
    validation_dataset = training_dataset[training_dataset['fold_%s'%i]==0]
    # Raw features plus this fold's Poisson GLM prediction as a stacked feature.
    featureset_fold = featureset + ['%s_%s'%(feature_column_pglm,i)]
    # In-fold / holdout matrices with labels, for training and early stopping.
    d_train = xgb.DMatrix(training_dataset_fold[featureset_fold].values,
                          training_dataset_fold[target_column].values)
    d_valid = xgb.DMatrix(validation_dataset[featureset_fold].values,
                          validation_dataset[target_column].values)
    # Label-free matrices for scoring the prediction and testing sets.
    Dpred = xgb.DMatrix(prediction_dataset[featureset_fold].values)
    Dtest = xgb.DMatrix(testing_dataset[featureset_fold].values)
    # Full training set (all folds) for the averaged CV prediction below.
    # NOTE(review): predicting the full training set with a model trained on
    # 4/5 of it makes the training "CV" score partly in-sample — optimistic.
    Dtrain = xgb.DMatrix(training_dataset[featureset_fold].values,
                         training_dataset[target_column])
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Load a previously trained fold model if allowed, otherwise train and save it.
    xgb_model_file='%s%s_%s.model'%(ModelsDir,ModelName,i)
    if os.path.exists(xgb_model_file) and UseSavedIfExists:
        print('%s file exists. Reading model from the file'%xgb_model_file)
        with open(xgb_model_file, 'rb') as fh:
            xgb_model = pickle.load(fh)
    else:
        print('%s file does not exist. Training model...'%xgb_model_file)
        xgb_model = xgb.train(simon_params, d_train, nrounds, watchlist,
                              verbose_eval=100, early_stopping_rounds=100)
        with open(xgb_model_file, 'wb') as fh:
            pickle.dump(xgb_model, fh)
    # Predict with best_ntree_limit+50 trees (slightly past the early-stopping
    # optimum). NOTE(review): ntree_limit/best_ntree_limit were removed in
    # xgboost >= 2.0 (use iteration_range) — confirm the pinned version.
    ntrees = xgb_model.best_ntree_limit + 50
    pred = xgb_model.predict(Dtrain, ntree_limit=ntrees)
    training_dataset[prediction_column_cv] += pred/(kfold)
    training_dataset['%s_%s'%(prediction_column_fold,i)] = pred
    pred = xgb_model.predict(Dtest, ntree_limit=ntrees)
    testing_dataset[prediction_column_cv] += pred/(kfold)
    testing_dataset['%s_%s'%(prediction_column_fold,i)] = pred
    pred = xgb_model.predict(Dpred, ntree_limit=ntrees)
    prediction_dataset[prediction_column_cv] += pred/(kfold)
    prediction_dataset['%s_%s'%(prediction_column_fold,i)] = pred
    # Per-fold scores. NOTE(review): computed on the partially accumulated
    # *_cv_pred column, so only the last fold's entry reflects the full average.
    Train_Gini_l.append(NormalizedWeightedGini(training_dataset[target_column],training_dataset[prediction_column_cv],training_dataset['ecy']))
    Test_Gini_l.append(NormalizedWeightedGini(testing_dataset[target_column],testing_dataset[prediction_column_cv],testing_dataset['ecy']))
    Train_LogLik_l.append(nLogLik_XGBoost(training_dataset[target_column],training_dataset[prediction_column_cv]))
    Test_LogLik_l.append(nLogLik_XGBoost(testing_dataset[target_column],testing_dataset[prediction_column_cv]))
#-----------------------------------------------------------------------------------------------------------
# Persist the datasets — now carrying the new prediction columns — back to
# their source CSVs, then assemble the per-fold scores into a summary table.
training_dataset.to_csv('%sproperty_wcf_training.csv' % DataDir, header=True, index=False)
testing_dataset.to_csv('%sproperty_wcf_testing.csv' % DataDir, header=True, index=False)
prediction_dataset.to_csv('%sproperty_wcf_prediction.csv' % DataDir, header=True, index=False)
# One row per fold; see the review note in the CV loop about partial averages.
Scores = pd.DataFrame(
    list(zip(Train_Gini_l, Test_Gini_l, Train_LogLik_l, Test_LogLik_l)),
    columns=['Train_Gini', 'Test_Gini', 'Train_nLogLik', 'Test_nLogLik'],
)
Scores
Results (Train Gini / Test Gini): pure Poisson XGB: 0.495222 / 0.439343; Poisson XGB + Poisson GLM: 0.496839 / 0.436388; Poisson XGB + NB GLM: 0.495925 / 0.436334.
Adding the Poisson GLM or NB GLM prediction as a feature does not improve the test result. The classification XGB cannot be added to this test because it belongs to a different training dataset; including it might improve results.