Experiments combining the predictions of a Poisson GLM and a negative binomial (NB) GLM with the raw dataset features (yearbuilt, sqft, etc.) in a Poisson XGBoost model, where ecy is used as an offset in the GLMs and as a base margin in XGBoost. A classification XGBoost model cannot be added in this configuration because it was trained on a different training dataset.
# --- Imports -----------------------------------------------------------------
import os
import pickle
import sys

import numpy as np
import pandas as pd
import xgboost as xgb

# Local utilities live outside the package path; register them before import.
sys.path.append('/home/kate/code/Utils/')
from MyFunctions import NormalizedWeightedGini
from MyFunctions import nLogLik_XGBoost

# --- Run configuration -------------------------------------------------------
ModelsDir = '/home/kate/Research/Property/Models/'
ModelName = 'wc_Poisson_XGB_All_bm_ecy'
DataDir = '/home/kate/Research/Property/Data/'
# When True, reuse previously pickled per-fold models instead of retraining.
UseSavedIfExists = False
# --- Data loading ------------------------------------------------------------
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the drop-in replacement (malformed rows are skipped).
training_dataset = pd.read_csv('%sproperty_wcf_training.csv' % DataDir,
                               on_bad_lines='skip', index_col=False)
testing_dataset = pd.read_csv('%sproperty_wcf_testing.csv' % DataDir,
                              on_bad_lines='skip', index_col=False)
prediction_dataset = pd.read_csv('%sproperty_wcf_prediction.csv' % DataDir,
                                 on_bad_lines='skip', index_col=False)

# --- Column configuration ----------------------------------------------------
# Target: water-peril claim count; offset: log exposure (log of `ecy`).
target_column = 'cova_ic_nc_water'
offset_column = 'log_ecy'
# Averaged out-of-fold prediction column and per-fold prediction column prefix.
prediction_column_cv = 'cova_ic_nc_water_cv_pred'
prediction_column_fold = 'cova_ic_nc_water_pred'
# Stacked-model feature column prefixes (a per-fold `_<i>` suffix is appended
# inside the CV loop).
feature_column_pglm = 'poissonglm_ofst_ecy'
feature_column_nbglm = 'nbglm_ofst_ecy'
feature_column_cxgb = 'class_bm_ecy_xgb_fold'
# Raw property features shared by every fold.
featureset = [
    'cova_deductible',
    'roofcd_encd',
    'sqft',
    'usagetype_encd',
    'yearbuilt',
    'cova_limit',
    'water_risk_fre_3_blk',
    'water_risk_3_blk',
    'ecy',
]
# --- XGBoost hyperparameters for the Poisson frequency model -----------------
nrounds = 5000  # maximum boosting rounds (early stopping caps the effective count)
simon_params = dict(
    objective='count:poisson',
    eval_metric='poisson-nloglik',
    silent=True,  # NOTE(review): ignored by recent XGBoost (superseded by `verbosity`)
    booster='gbtree',
    eta=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    max_depth=6,
    gamma=0,
    seed=42,
)
kfold = 5  # number of cross-validation folds
# Initialize the output columns on every split: one averaged CV prediction
# column plus one prediction column per fold, all starting at zero.
for _frame in (training_dataset, testing_dataset, prediction_dataset):
    _frame[prediction_column_cv] = 0
    for _fold in range(kfold):
        _frame['%s_%s' % (prediction_column_fold, _fold)] = 0
#
# Per-fold score accumulators, appended to once per CV iteration.
Train_Gini_l = []
Test_Gini_l = []
Train_LogLik_l = []
Test_LogLik_l = []
#-----------------------------------------------------------------------------------------------------------
# CV-fold modeling: train one Poisson XGB model per fold, with the NB GLM
# per-fold prediction stacked as an extra feature and log(ecy) supplied as the
# base margin (the XGBoost analogue of a GLM exposure offset).
for i in range(0, kfold):
    print(' fold: {} of {} : '.format(i + 1, kfold))
    # Rows with fold_<i> > 0 train this fold's model; rows with fold_<i> == 0
    # form its validation set.
    training_dataset_fold = training_dataset[training_dataset['fold_%s' % i] > 0]
    validation_dataset = training_dataset[training_dataset['fold_%s' % i] == 0]
    # Only the NB GLM fold prediction is stacked onto the raw features; the
    # Poisson GLM / classification XGB variants were tried and left disabled:
    # ['%s_%s'%(feature_column_pglm,i)] + ['%s_%s'%(feature_column_nbglm,i)] + ['%s_%s'%(feature_column_cxgb,i)]
    featureset_fold = featureset + ['%s_%s' % (feature_column_nbglm, i)]
    X_train = training_dataset_fold[featureset_fold].copy()
    X_valid = validation_dataset[featureset_fold].copy()
    y_train = training_dataset_fold[target_column].copy()
    y_valid = validation_dataset[target_column].copy()
    offset_train = training_dataset_fold[offset_column].copy()
    offset_valid = validation_dataset[offset_column].copy()
    # Convert to plain numpy arrays for XGBoost.
    X_train = X_train.values
    X_valid = X_valid.values
    #
    y_pred_train = pd.DataFrame(index=y_train.index)
    y_pred_train[prediction_column_cv] = 0
    #
    y_train = y_train.values
    y_valid = y_valid.values
    #
    offset_train = offset_train.values
    offset_valid = offset_valid.values
    #
    d_train = xgb.DMatrix(X_train, y_train)
    d_train.set_base_margin(offset_train)
    #
    d_valid = xgb.DMatrix(X_valid, y_valid)
    d_valid.set_base_margin(offset_valid)
    #
    # Full prediction / testing / training matrices used for scoring this fold.
    X_pred = prediction_dataset[featureset_fold]
    y_pred = prediction_dataset[target_column]
    offset_pred = prediction_dataset[offset_column]
    Dpred = xgb.DMatrix(X_pred.values)
    Dpred.set_base_margin(offset_pred.values)
    #
    X_test = testing_dataset[featureset_fold]
    y_test = testing_dataset[target_column]
    offset_test = testing_dataset[offset_column]
    Dtest = xgb.DMatrix(X_test.values)
    Dtest.set_base_margin(offset_test.values)
    #
    X = training_dataset[featureset_fold]
    y = training_dataset[target_column]
    offset = training_dataset[offset_column]
    Dtrain = xgb.DMatrix(X.values, y)
    Dtrain.set_base_margin(offset.values)
    #
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Train this fold's model, or reload a previously pickled one.
    xgb_model_file = '%s%s_%s.model' % (ModelsDir, ModelName, i)
    # `and` (not bitwise `&`) so the flag short-circuits the filesystem check.
    if UseSavedIfExists and os.path.exists(xgb_model_file):
        print('%s file exists. Reading model from the file' % xgb_model_file)
        # `with` closes the file handle (the original leaked it via bare open()).
        with open(xgb_model_file, 'rb') as model_fh:
            xgb_model = pickle.load(model_fh)
    else:
        print('%s file does not exist. Training model...' % xgb_model_file)
        xgb_model = xgb.train(simon_params, d_train, nrounds, watchlist,
                              verbose_eval=100, early_stopping_rounds=100)
        with open(xgb_model_file, 'wb') as model_fh:
            pickle.dump(xgb_model, model_fh)
    # Score all three splits with best iteration + 50 extra trees.
    # NOTE(review): `ntree_limit` is deprecated in XGBoost >= 1.6 in favor of
    # `iteration_range`; kept as-is to match the installed version's API.
    pred = xgb_model.predict(Dtrain, ntree_limit=xgb_model.best_ntree_limit + 50)
    training_dataset[prediction_column_cv] += pred / (kfold)
    training_dataset['%s_%s' % (prediction_column_fold, i)] = pred
    pred = xgb_model.predict(Dtest, ntree_limit=xgb_model.best_ntree_limit + 50)
    testing_dataset[prediction_column_cv] += pred / (kfold)
    testing_dataset['%s_%s' % (prediction_column_fold, i)] = pred
    pred = xgb_model.predict(Dpred, ntree_limit=xgb_model.best_ntree_limit + 50)
    prediction_dataset[prediction_column_cv] += pred / (kfold)
    prediction_dataset['%s_%s' % (prediction_column_fold, i)] = pred
    # Cumulative scores after this fold (the CV column is a running partial
    # mean until all folds have contributed).
    Train_Gini_l.append(NormalizedWeightedGini(training_dataset[target_column],
                                               training_dataset[prediction_column_cv],
                                               training_dataset['ecy']))
    Test_Gini_l.append(NormalizedWeightedGini(testing_dataset[target_column],
                                              testing_dataset[prediction_column_cv],
                                              testing_dataset['ecy']))
    Train_LogLik_l.append(nLogLik_XGBoost(training_dataset[target_column],
                                          training_dataset[prediction_column_cv]))
    Test_LogLik_l.append(nLogLik_XGBoost(testing_dataset[target_column],
                                         testing_dataset[prediction_column_cv]))
#-----------------------------------------------------------------------------------------------------------
# Persist the augmented splits (new prediction columns included) back to the
# original CSV files, then assemble the per-fold score table.
for frame, csv_name in ((training_dataset, 'property_wcf_training.csv'),
                        (testing_dataset, 'property_wcf_testing.csv'),
                        (prediction_dataset, 'property_wcf_prediction.csv')):
    frame.to_csv('%s%s' % (DataDir, csv_name), header=True, index=False)

score_rows = list(zip(Train_Gini_l, Test_Gini_l, Train_LogLik_l, Test_LogLik_l))
Scores = pd.DataFrame(score_rows,
                      columns=['Train_Gini', 'Test_Gini', 'Train_nLogLik', 'Test_nLogLik'])
Scores  # notebook-style display; has no effect when run as a plain script
Results (Train Gini / Test Gini): pure Poisson XGB: 0.488423 / 0.441962; Poisson XGB + Poisson GLM: 0.500300 / 0.440047; Poisson XGB + NB GLM: 0.501749 / 0.440039.
Adding the Poisson GLM or NB GLM predictions does not improve the test result but increases overfitting. The classification XGBoost cannot be added to this test because it was trained on a different training dataset; including it might improve results.