Tests combining the results of the Poisson GLM and the NB GLM with the raw dataset features (yearbuilt, sqft, etc.) in a Poisson XGB model, where ecy is used as a feature in both the GLMs and the XGB. The classification XGB cannot be added in this configuration because it is in a different training dataset.

In [1]:
# Run configuration for this experiment.
# Input/output locations: the CSV splits are read from (and written back to)
# DataDir; one pickled booster per fold is stored under ModelsDir.
DataDir = "/home/kate/Research/Property/Data/"
ModelsDir = "/home/kate/Research/Property/Models/"
# Base name of the per-fold model files: <ModelsDir><ModelName>_<fold>.model
ModelName = "wc_Poisson_XGB_All_f_ecy"
# When True, a model file that already exists is loaded instead of retrained.
UseSavedIfExists = False
In [2]:
import sys

sys.path.append('/home/kate/code/Utils/')

from MyFunctions import NormalizedWeightedGini
from MyFunctions import nLogLik_XGBoost
In [3]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import os
In [4]:
# Load the training / testing / prediction splits from DataDir.
# index_col=False stops pandas from using the first column as the index and
# error_bad_lines=False skips malformed rows instead of raising.
# NOTE(review): error_bad_lines is deprecated in pandas >= 1.3 (replaced by
# on_bad_lines='skip') and removed in 2.0 - update when the environment moves.
_read_options = dict(error_bad_lines=False, index_col=False)
training_dataset = pd.read_csv(DataDir + 'property_wcf_training.csv', **_read_options)
testing_dataset = pd.read_csv(DataDir + 'property_wcf_testing.csv', **_read_options)
prediction_dataset = pd.read_csv(DataDir + 'property_wcf_prediction.csv', **_read_options)
In [5]:
# Column holding the label the XGB model is fitted to.
target_column = "cova_ic_nc_water"

# Output columns: the per-fold prediction columns are named
# <prediction_column_fold>_<fold>, and prediction_column_cv holds their mean.
prediction_column_fold = "cova_ic_nc_water_pred"
prediction_column_cv = "cova_ic_nc_water_cv_pred"

# Prefixes of per-fold columns produced by other models (<prefix>_<fold>).
# Only the Poisson GLM one is added to the feature set in this run; the other
# two appear only in a commented-out alternative in the modelling loop.
feature_column_pglm = "poissonglm_f_ecy"
feature_column_nbglm = "nbglm_f_ecy"
feature_column_cxgb = "class_f_ecy_xgb_fold"
In [6]:
# Raw dataset columns fed to the XGB model in every fold. The order matters:
# the frames are converted to plain arrays before building the DMatrix, so
# features are identified by position only.
# 'ecy' is both a feature here and the third argument of
# NormalizedWeightedGini when scoring (presumably the weight - confirm).
featureset = [
    "cova_deductible",
    "roofcd_encd",
    "sqft",
    "usagetype_encd",
    "yearbuilt",
    "cova_limit",
    "water_risk_fre_3_blk",
    "water_risk_3_blk",
    "ecy",
]
In [7]:
# Upper bound on boosting rounds handed to xgb.train; training is also run
# with early stopping, so fewer rounds may actually be used.
nrounds = 5000

# Booster parameters for the Poisson count model, passed to xgb.train as-is.
# NOTE(review): 'silent' is deprecated in newer xgboost releases in favour of
# 'verbosity' - confirm against the installed version before upgrading.
simon_params = dict(
    objective='count:poisson',
    eval_metric='poisson-nloglik',
    silent=True,
    booster='gbtree',
    eta=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    max_depth=6,
    gamma=0,
    seed=42,
)
In [11]:
# Number of cross-validation folds. The modelling loop reads one indicator
# column per fold from the training data ('fold_0' .. 'fold_<kfold-1>') and
# averages the kfold per-fold predictions into the CV prediction column.
kfold = 5
In [12]:
# Cross-validated Poisson XGB stacking run.
#
# For every fold i a booster is trained on the training rows with fold_<i> > 0
# and early-stopped on the rows with fold_<i> == 0. Each booster then scores
# ALL rows of the training, testing and prediction datasets. Two kinds of
# columns are written to each dataset:
#   <prediction_column_fold>_<i> : prediction of the fold-i booster
#   <prediction_column_cv>       : mean of the kfold per-fold predictions
#
# NOTE(review): for the training dataset the averaged prediction includes
# boosters that were fitted on the very rows being scored, so it is not an
# out-of-fold prediction and the Train_* scores below are in-sample.
datasets_to_score = (training_dataset, testing_dataset, prediction_dataset)

# Create the output columns up front (CV column first, then one per fold) so
# the column order of the saved CSVs is stable. Floats are used because the
# CV column accumulates fractional predictions.
for dataset in datasets_to_score:
    dataset[prediction_column_cv] = 0.0
    for i in range(kfold):
        dataset['%s_%s' % (prediction_column_fold, i)] = 0.0

Train_Gini_l = []
Test_Gini_l = []
Train_LogLik_l = []
Test_LogLik_l = []

#-----------------------------------------------------------------------------------------------------------
#CV-folds modeling
for i in range(kfold):
    print(' fold: {}  of  {} : '.format(i+1, kfold))
    fold_flag = training_dataset['fold_%s' % i]
    training_dataset_fold = training_dataset[fold_flag > 0]
    validation_dataset = training_dataset[fold_flag == 0]

    # Raw features plus the Poisson GLM prediction column of this fold.
    # Alternatives tried: '%s_%s' % (feature_column_nbglm, i) and
    # '%s_%s' % (feature_column_cxgb, i).
    featureset_fold = featureset + ['%s_%s' % (feature_column_pglm, i)]

    #preparing for XGB run
    d_train = xgb.DMatrix(training_dataset_fold[featureset_fold].values,
                          training_dataset_fold[target_column].values)
    d_valid = xgb.DMatrix(validation_dataset[featureset_fold].values,
                          validation_dataset[target_column].values)
    # The last entry of the watchlist ('valid') drives early stopping.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    #applying XGB
    xgb_model_file = '%s%s_%s.model' % (ModelsDir, ModelName, i)
    if UseSavedIfExists and os.path.exists(xgb_model_file):
        print('%s file exists. Reading model from the file'%xgb_model_file)
        # Only load model files produced by this notebook: unpickling
        # executes arbitrary code from the file.
        with open(xgb_model_file, 'rb') as model_fh:
            xgb_model = pickle.load(model_fh)
    else:
        # Also reached when the file exists but UseSavedIfExists is False;
        # the existing file is then overwritten.
        print('%s file does not exists. Training model...'%xgb_model_file)
        xgb_model = xgb.train(simon_params, d_train, nrounds, watchlist,
                              verbose_eval=100, early_stopping_rounds=100)
        with open(xgb_model_file, 'wb') as model_fh:
            pickle.dump(xgb_model, model_fh)

    # NOTE(review): predictions deliberately use 50 rounds beyond the best
    # early-stopping iteration - confirm this offset is intended.
    ntree_limit = xgb_model.best_ntree_limit + 50
    for dataset in datasets_to_score:
        pred = xgb_model.predict(xgb.DMatrix(dataset[featureset_fold].values),
                                 ntree_limit=ntree_limit)
        dataset[prediction_column_cv] += pred / kfold
        dataset['%s_%s' % (prediction_column_fold, i)] = pred

#Scores cv folds (computed once, on the fold-averaged predictions)
Train_Gini_l.append(NormalizedWeightedGini(training_dataset[target_column],training_dataset[prediction_column_cv],training_dataset['ecy']))
Test_Gini_l.append(NormalizedWeightedGini(testing_dataset[target_column],testing_dataset[prediction_column_cv],testing_dataset['ecy']))
Train_LogLik_l.append(nLogLik_XGBoost(training_dataset[target_column],training_dataset[prediction_column_cv]))
Test_LogLik_l.append(nLogLik_XGBoost(testing_dataset[target_column],testing_dataset[prediction_column_cv]))
#-----------------------------------------------------------------------------------------------------------
#Saving training
# The datasets are written back over the CSV files they were loaded from, now
# including the prediction columns added above.
training_dataset.to_csv('%sproperty_wcf_training.csv'%DataDir,header=True,index=False)
testing_dataset.to_csv('%sproperty_wcf_testing.csv'%DataDir,header=True,index=False)
prediction_dataset.to_csv('%sproperty_wcf_prediction.csv'%DataDir,header=True,index=False)
 fold: 1  of  5 : 
/home/kate/Research/Property/Models/wc_Poisson_XGB_All_f_ecy_0.model file does not exists. Training model...
[0]	train-poisson-nloglik:0.506691	valid-poisson-nloglik:0.506681
Multiple eval metrics have been passed: 'valid-poisson-nloglik' will be used for early stopping.

Will train until valid-poisson-nloglik hasn't improved in 100 rounds.
[100]	train-poisson-nloglik:0.32289	valid-poisson-nloglik:0.322872
[200]	train-poisson-nloglik:0.213154	valid-poisson-nloglik:0.213142
[300]	train-poisson-nloglik:0.1483	valid-poisson-nloglik:0.148305
[400]	train-poisson-nloglik:0.110522	valid-poisson-nloglik:0.110557
[500]	train-poisson-nloglik:0.088929	valid-poisson-nloglik:0.089007
[600]	train-poisson-nloglik:0.076871	valid-poisson-nloglik:0.077008
[700]	train-poisson-nloglik:0.07027	valid-poisson-nloglik:0.070476
[800]	train-poisson-nloglik:0.066748	valid-poisson-nloglik:0.067044
[900]	train-poisson-nloglik:0.064872	valid-poisson-nloglik:0.065265
[1000]	train-poisson-nloglik:0.063859	valid-poisson-nloglik:0.064362
[1100]	train-poisson-nloglik:0.063288	valid-poisson-nloglik:0.063913
[1200]	train-poisson-nloglik:0.062943	valid-poisson-nloglik:0.063689
[1300]	train-poisson-nloglik:0.062715	valid-poisson-nloglik:0.063569
[1400]	train-poisson-nloglik:0.062543	valid-poisson-nloglik:0.063508
[1500]	train-poisson-nloglik:0.062405	valid-poisson-nloglik:0.063471
[1600]	train-poisson-nloglik:0.062284	valid-poisson-nloglik:0.063452
[1700]	train-poisson-nloglik:0.062172	valid-poisson-nloglik:0.063438
[1800]	train-poisson-nloglik:0.062055	valid-poisson-nloglik:0.063428
[1900]	train-poisson-nloglik:0.061947	valid-poisson-nloglik:0.063424
[2000]	train-poisson-nloglik:0.061821	valid-poisson-nloglik:0.063419
[2100]	train-poisson-nloglik:0.061706	valid-poisson-nloglik:0.063414
[2200]	train-poisson-nloglik:0.061592	valid-poisson-nloglik:0.063413
[2300]	train-poisson-nloglik:0.061481	valid-poisson-nloglik:0.063412
[2400]	train-poisson-nloglik:0.061368	valid-poisson-nloglik:0.063411
Stopping. Best iteration:
[2304]	train-poisson-nloglik:0.061476	valid-poisson-nloglik:0.06341

 fold: 2  of  5 : 
/home/kate/Research/Property/Models/wc_Poisson_XGB_All_f_ecy_1.model file does not exists. Training model...
[0]	train-poisson-nloglik:0.50669	valid-poisson-nloglik:0.506686
Multiple eval metrics have been passed: 'valid-poisson-nloglik' will be used for early stopping.

Will train until valid-poisson-nloglik hasn't improved in 100 rounds.
[100]	train-poisson-nloglik:0.322885	valid-poisson-nloglik:0.322892
[200]	train-poisson-nloglik:0.213136	valid-poisson-nloglik:0.213166
[300]	train-poisson-nloglik:0.14828	valid-poisson-nloglik:0.148343
[400]	train-poisson-nloglik:0.110496	valid-poisson-nloglik:0.110609
[500]	train-poisson-nloglik:0.088905	valid-poisson-nloglik:0.089081
[600]	train-poisson-nloglik:0.076837	valid-poisson-nloglik:0.077097
[700]	train-poisson-nloglik:0.07024	valid-poisson-nloglik:0.070601
[800]	train-poisson-nloglik:0.06671	valid-poisson-nloglik:0.06719
[900]	train-poisson-nloglik:0.06483	valid-poisson-nloglik:0.06544
[1000]	train-poisson-nloglik:0.063811	valid-poisson-nloglik:0.064564
[1100]	train-poisson-nloglik:0.063238	valid-poisson-nloglik:0.064133
[1200]	train-poisson-nloglik:0.062896	valid-poisson-nloglik:0.063929
[1300]	train-poisson-nloglik:0.062672	valid-poisson-nloglik:0.06383
[1400]	train-poisson-nloglik:0.062502	valid-poisson-nloglik:0.063782
[1500]	train-poisson-nloglik:0.062367	valid-poisson-nloglik:0.063759
[1600]	train-poisson-nloglik:0.062247	valid-poisson-nloglik:0.06375
[1700]	train-poisson-nloglik:0.062136	valid-poisson-nloglik:0.063744
[1800]	train-poisson-nloglik:0.062025	valid-poisson-nloglik:0.063737
[1900]	train-poisson-nloglik:0.06191	valid-poisson-nloglik:0.063735
Stopping. Best iteration:
[1869]	train-poisson-nloglik:0.061947	valid-poisson-nloglik:0.063732

 fold: 3  of  5 : 
/home/kate/Research/Property/Models/wc_Poisson_XGB_All_f_ecy_2.model file does not exists. Training model...
[0]	train-poisson-nloglik:0.506691	valid-poisson-nloglik:0.506686
Multiple eval metrics have been passed: 'valid-poisson-nloglik' will be used for early stopping.

Will train until valid-poisson-nloglik hasn't improved in 100 rounds.
[100]	train-poisson-nloglik:0.322894	valid-poisson-nloglik:0.322903
[200]	train-poisson-nloglik:0.213147	valid-poisson-nloglik:0.213177
[300]	train-poisson-nloglik:0.148289	valid-poisson-nloglik:0.14835
[400]	train-poisson-nloglik:0.110507	valid-poisson-nloglik:0.110613
[500]	train-poisson-nloglik:0.088916	valid-poisson-nloglik:0.089082
[600]	train-poisson-nloglik:0.076846	valid-poisson-nloglik:0.077084
[700]	train-poisson-nloglik:0.070242	valid-poisson-nloglik:0.070573
[800]	train-poisson-nloglik:0.066715	valid-poisson-nloglik:0.06715
[900]	train-poisson-nloglik:0.064835	valid-poisson-nloglik:0.065391
[1000]	train-poisson-nloglik:0.063823	valid-poisson-nloglik:0.064508
[1100]	train-poisson-nloglik:0.063253	valid-poisson-nloglik:0.064073
[1200]	train-poisson-nloglik:0.062906	valid-poisson-nloglik:0.063859
[1300]	train-poisson-nloglik:0.062676	valid-poisson-nloglik:0.063753
[1400]	train-poisson-nloglik:0.062502	valid-poisson-nloglik:0.0637
[1500]	train-poisson-nloglik:0.062364	valid-poisson-nloglik:0.063676
[1600]	train-poisson-nloglik:0.062239	valid-poisson-nloglik:0.063659
[1700]	train-poisson-nloglik:0.062117	valid-poisson-nloglik:0.063646
[1800]	train-poisson-nloglik:0.061995	valid-poisson-nloglik:0.06364
[1900]	train-poisson-nloglik:0.061866	valid-poisson-nloglik:0.063636
Stopping. Best iteration:
[1870]	train-poisson-nloglik:0.061904	valid-poisson-nloglik:0.063635

 fold: 4  of  5 : 
/home/kate/Research/Property/Models/wc_Poisson_XGB_All_f_ecy_3.model file does not exists. Training model...
[0]	train-poisson-nloglik:0.506688	valid-poisson-nloglik:0.506698
Multiple eval metrics have been passed: 'valid-poisson-nloglik' will be used for early stopping.

Will train until valid-poisson-nloglik hasn't improved in 100 rounds.
[100]	train-poisson-nloglik:0.322883	valid-poisson-nloglik:0.322916
[200]	train-poisson-nloglik:0.213136	valid-poisson-nloglik:0.213201
[300]	train-poisson-nloglik:0.148276	valid-poisson-nloglik:0.148384
[400]	train-poisson-nloglik:0.11049	valid-poisson-nloglik:0.110659
[500]	train-poisson-nloglik:0.088888	valid-poisson-nloglik:0.089139
[600]	train-poisson-nloglik:0.076816	valid-poisson-nloglik:0.077164
[700]	train-poisson-nloglik:0.070212	valid-poisson-nloglik:0.070666
[800]	train-poisson-nloglik:0.066685	valid-poisson-nloglik:0.06726
[900]	train-poisson-nloglik:0.064802	valid-poisson-nloglik:0.065511
[1000]	train-poisson-nloglik:0.063786	valid-poisson-nloglik:0.064628
[1100]	train-poisson-nloglik:0.063208	valid-poisson-nloglik:0.064187
[1200]	train-poisson-nloglik:0.062862	valid-poisson-nloglik:0.063974
[1300]	train-poisson-nloglik:0.062636	valid-poisson-nloglik:0.063864
[1400]	train-poisson-nloglik:0.062466	valid-poisson-nloglik:0.063811
[1500]	train-poisson-nloglik:0.062333	valid-poisson-nloglik:0.063781
[1600]	train-poisson-nloglik:0.06221	valid-poisson-nloglik:0.063764
[1700]	train-poisson-nloglik:0.062089	valid-poisson-nloglik:0.063749
[1800]	train-poisson-nloglik:0.061978	valid-poisson-nloglik:0.063743
[1900]	train-poisson-nloglik:0.061861	valid-poisson-nloglik:0.063735
[2000]	train-poisson-nloglik:0.061738	valid-poisson-nloglik:0.063729
[2100]	train-poisson-nloglik:0.061627	valid-poisson-nloglik:0.063729
Stopping. Best iteration:
[2045]	train-poisson-nloglik:0.061689	valid-poisson-nloglik:0.063727

 fold: 5  of  5 : 
/home/kate/Research/Property/Models/wc_Poisson_XGB_All_f_ecy_4.model file does not exists. Training model...
[0]	train-poisson-nloglik:0.506688	valid-poisson-nloglik:0.506698
Multiple eval metrics have been passed: 'valid-poisson-nloglik' will be used for early stopping.

Will train until valid-poisson-nloglik hasn't improved in 100 rounds.
[100]	train-poisson-nloglik:0.322883	valid-poisson-nloglik:0.322927
[200]	train-poisson-nloglik:0.213137	valid-poisson-nloglik:0.213221
[300]	train-poisson-nloglik:0.148275	valid-poisson-nloglik:0.148412
[400]	train-poisson-nloglik:0.11049	valid-poisson-nloglik:0.110692
[500]	train-poisson-nloglik:0.088886	valid-poisson-nloglik:0.089174
[600]	train-poisson-nloglik:0.076811	valid-poisson-nloglik:0.077201
[700]	train-poisson-nloglik:0.0702	valid-poisson-nloglik:0.070713
[800]	train-poisson-nloglik:0.066665	valid-poisson-nloglik:0.067311
[900]	train-poisson-nloglik:0.064783	valid-poisson-nloglik:0.065568
[1000]	train-poisson-nloglik:0.063768	valid-poisson-nloglik:0.064696
[1100]	train-poisson-nloglik:0.063195	valid-poisson-nloglik:0.064262
[1200]	train-poisson-nloglik:0.062856	valid-poisson-nloglik:0.064049
[1300]	train-poisson-nloglik:0.062628	valid-poisson-nloglik:0.063936
[1400]	train-poisson-nloglik:0.062453	valid-poisson-nloglik:0.063877
[1500]	train-poisson-nloglik:0.062314	valid-poisson-nloglik:0.063843
[1600]	train-poisson-nloglik:0.062191	valid-poisson-nloglik:0.063819
[1700]	train-poisson-nloglik:0.062069	valid-poisson-nloglik:0.063804
[1800]	train-poisson-nloglik:0.061948	valid-poisson-nloglik:0.063797
[1900]	train-poisson-nloglik:0.061835	valid-poisson-nloglik:0.063791
Stopping. Best iteration:
[1862]	train-poisson-nloglik:0.061878	valid-poisson-nloglik:0.063791

In [13]:
# Collect the Gini and negative log-likelihood scores gathered by the
# modelling cell into one table: one row per appended score set.
score_columns = ['Train_Gini', 'Test_Gini', 'Train_nLogLik', 'Test_nLogLik']
score_rows = list(zip(Train_Gini_l, Test_Gini_l, Train_LogLik_l, Test_LogLik_l))
Scores = pd.DataFrame(score_rows, columns=score_columns)
# Bare expression so the notebook renders the table as the cell output.
Scores
Out[13]:
Train_Gini Test_Gini Train_nLogLik Test_nLogLik
0 0.496839 0.436388 0.061763 0.039218

| Model | Train Gini | Test Gini |
|---|---|---|
| Pure Poisson XGB | 0.495222 | 0.439343 |
| Poisson XGB + Poisson GLM | 0.496839 | 0.436388 |
| Poisson XGB + NB GLM | 0.495925 | 0.436334 |

Adding the Poisson GLM or the NB GLM does not improve the result. The classification XGB cannot be added to this test because it is in a different training dataset; it may improve the results.