In [1]:
# --- Run configuration -------------------------------------------------------
# Folder the per-fold pickled boosters are written to / read from.
ModelsDir = "/home/kate/Research/Property/Models/"
# File-name stem shared by the fold models (the fold index and ".model" are appended).
ModelName = "wc_LogRegObj_Reg_XGB_mae"
# When True, a fold whose model file already exists is loaded instead of retrained.
UseSavedIfExists = True
# Folder holding the training / testing / prediction CSV files.
DataDir = "/home/kate/Research/Property/Data/"
In [2]:
import sys

sys.path.append('/home/kate/code/Utils/')

from MyFunctions import NormalizedWeightedGini
from MyFunctions import mae
from MyFunctions import rmse
In [3]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import os
In [4]:
# Read the three modelling tables from DataDir.
# error_bad_lines=False and index_col=False are passed identically to every read.
# NOTE(review): error_bad_lines is deprecated in newer pandas releases in favour
# of on_bad_lines='skip' -- confirm the pinned pandas version before upgrading.
training_dataset = pd.read_csv(
    '%sproperty_wcs_training_for_normal.csv' % DataDir,
    error_bad_lines=False,
    index_col=False,
)
testing_dataset = pd.read_csv(
    '%sproperty_wcf_testing.csv' % DataDir,
    error_bad_lines=False,
    index_col=False,
)
prediction_dataset = pd.read_csv(
    '%sproperty_water_claims_non_cat_fs.csv' % DataDir,
    error_bad_lines=False,
    index_col=False,
)
In [5]:
# Column the boosters are fitted to; it is passed through np.exp when metrics are computed.
target_column = "log_cova_il_nc_water"
# Column that receives the fold-averaged prediction in each of the three tables.
prediction_column_cv = "LogRegObj_reg_xgb_mae"
In [6]:
# Model inputs, listed in the column order that is handed to xgboost.
features = [
    'cova_deductible', 'roofcd_encd',
    'water_risk_sev_3_blk', 'sqft',
    'rep_cost_3_blk', 'yearbuilt',
    'ecy', 'usagetype_encd',
]
In [7]:
# Feature frames for the three tables (same column selection for each).
X = training_dataset[features]
X_test = testing_dataset[features]
X_pred = prediction_dataset[features]

# Matching target columns.
y = training_dataset[target_column]
y_test = testing_dataset[target_column]
y_pred = prediction_dataset[target_column]

# xgboost matrices used when scoring with each fold model.
# Only the training matrix carries labels; the other two are built from features alone.
Dtrain = xgb.DMatrix(X.values, label=y)
Dtest = xgb.DMatrix(X_test.values)
Dpred = xgb.DMatrix(X_pred.values)
In [8]:
# Boosting budget, early-stopping patience and fold count handed to the training loop.
nrounds = 500
esr = 100
# The training table is indexed by fold_0 ... fold_<kfold-1> columns in the loop.
kfold = 5

# Booster parameters passed unchanged to xgb.train.
xgb_params = {
    'seed': 42,
    'eta': 0.01,
    'colsample_bytree': 1,
    'silent': 1,
    'subsample': 1,
    'max_depth': 6,
    'gamma': 0.3,
    'min_child_weight': 3,
}
In [9]:
# Number of folds iterated over by the training loop.
# NOTE(review): kfold is already assigned the same value in the hyper-parameter
# cell above, so this re-assignment looks redundant.
kfold = 5
In [10]:
from sklearn.metrics import mean_absolute_error
def logregobj(preds, dtrain, con=2):
    """Custom xgboost objective: gradient and hessian of a smooth robust loss.

    With the residual ``x = preds - labels`` the function returns

        grad = con * x / (|x| + con)
        hess = con**2 / (|x| + con)**2

    which are the first and second derivatives of
    ``con**2 * (|x| / con - log(1 + |x| / con))`` (often called the Fair loss).
    The gradient is bounded by ``con`` in absolute value, so large residuals
    pull on the fit no harder than moderate ones.

    Parameters
    ----------
    preds : numpy array of current predictions.
    dtrain : object exposing ``get_label()`` (an ``xgb.DMatrix`` during training).
    con : positive scale constant of the loss. Defaults to 2, the value that was
        previously hard-coded, so ``xgb.train(obj=logregobj)`` behaves as before.
        Use ``functools.partial(logregobj, con=...)`` to try other values.

    Returns
    -------
    (grad, hess) : two arrays with the same shape as ``preds``.
    """
    labels = dtrain.get_label()
    x = preds - labels
    # |x| + con appears in both derivatives; compute it once.
    denom = np.abs(x) + con
    grad = con * x / denom
    hess = con**2 / denom**2
    return grad, hess

def evalerror(preds, dtrain):
    """Evaluation metric for xgb.train: MAE after exponentiating both sides.

    Both the predictions and the labels are passed through ``np.exp`` before
    the error is taken.

    Returns the pair ``('mae', value)``.
    """
    labels = dtrain.get_label()
    predicted_raw = np.exp(preds)
    actual_raw = np.exp(labels)
    # sklearn's signature is (y_true, y_pred); the arguments are given the other
    # way round here, which does not change the value because MAE is symmetric.
    score = mean_absolute_error(predicted_raw, actual_raw)
    return 'mae', score
In [11]:
# Start the blended prediction at zero in every table; each fold model then
# adds its 1/kfold share inside the loop, giving the fold average at the end.
training_dataset[prediction_column_cv] = 0
testing_dataset[prediction_column_cv] = 0
prediction_dataset[prediction_column_cv] = 0
#
# Metric accumulators; they are appended to by the scoring cell after the loop.
trn_gini_l = list()
trn_mae_l = list()
trn_rmse_l = list()
test_gini_l = list()
test_mae_l = list()
test_rmse_l = list()
#
for i in range(0,kfold):
    print(' fold: {}  of  {} : '.format(i+1, kfold))
    # Column fold_<i>: rows with value 0 are held out for validation,
    # rows with a value > 0 are used for fitting.
    training_dataset_fold = training_dataset[training_dataset['fold_%s'%i]>0]
    validation_dataset = training_dataset[training_dataset['fold_%s'%i]==0]

    X_train =  training_dataset_fold[features].copy()
    X_valid =  validation_dataset[features].copy()
    y_train =  training_dataset_fold[target_column].copy()
    y_valid =  validation_dataset[target_column].copy()

    #preparing for XGB run
    X_train = X_train.values
    X_valid = X_valid.values
    #
    # NOTE(review): y_pred_train is not read anywhere in the visible notebook;
    # it is kept only so the set of names this cell defines stays the same.
    y_pred_train=pd.DataFrame(index=y_train.index)
    y_pred_train[prediction_column_cv]=0
    #
    y_train = y_train.values
    y_valid = y_valid.values
    #
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    # The last entry of the watchlist ('valid') is the one early stopping monitors.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    #applying XGB
    xgb_model_file='%s%s_%s.model'%(ModelsDir,ModelName,i)
    # Short-circuit `and`: the file system is only touched when reuse is enabled.
    if UseSavedIfExists and os.path.exists(xgb_model_file):
        print('%s file exists. Reading model from the file'%xgb_model_file)
        # NOTE: unpickling can execute arbitrary code -- only load model files
        # that were produced by this notebook / a trusted source.
        with open(xgb_model_file, 'rb') as model_fh:
            xgb_model = pickle.load(model_fh)
    else:
        print('%s file does not exists. Training model...'%xgb_model_file)
        xgb_model = xgb.train(xgb_params, d_train, nrounds, watchlist,  obj=logregobj, feval=evalerror, verbose_eval=100, early_stopping_rounds=esr)
        # Context manager guarantees the file is flushed and closed before moving on.
        with open(xgb_model_file, 'wb') as model_fh:
            pickle.dump(xgb_model, model_fh)

    # Same tree budget for all three tables, computed once per fold.
    # NOTE(review): the reason for going 50 trees past best_ntree_limit is not
    # documented -- confirm it is intended.
    fold_ntree_limit = xgb_model.best_ntree_limit+50
    training_dataset[prediction_column_cv] += xgb_model.predict(Dtrain, ntree_limit=fold_ntree_limit) / (kfold)
    testing_dataset[prediction_column_cv] += xgb_model.predict(Dtest, ntree_limit=fold_ntree_limit) / (kfold)
    prediction_dataset[prediction_column_cv] += xgb_model.predict(Dpred, ntree_limit=fold_ntree_limit) / (kfold)
 fold: 1  of  5 : 
/home/kate/Research/Property/Models/wc_LogRegObj_Reg_XGB_mae_0.model file exists. Reading model from the file
 fold: 2  of  5 : 
/home/kate/Research/Property/Models/wc_LogRegObj_Reg_XGB_mae_1.model file exists. Reading model from the file
 fold: 3  of  5 : 
/home/kate/Research/Property/Models/wc_LogRegObj_Reg_XGB_mae_2.model file exists. Reading model from the file
 fold: 4  of  5 : 
/home/kate/Research/Property/Models/wc_LogRegObj_Reg_XGB_mae_3.model file exists. Reading model from the file
 fold: 5  of  5 : 
/home/kate/Research/Property/Models/wc_LogRegObj_Reg_XGB_mae_4.model file exists. Reading model from the file
In [12]:
# Replace the averaged prediction in each table by its exponential.
# Running this cell a second time without re-running the fold loop would
# exponentiate the column again.
for scored_table in (training_dataset, testing_dataset, prediction_dataset):
    scored_table[prediction_column_cv] = np.exp(scored_table[prediction_column_cv])
In [13]:
# --- "trn" metrics -----------------------------------------------------------
# NOTE(review): the value stored as trn_gini is computed on prediction_dataset
# rows with cal_year > 2019, not on training_dataset -- confirm this is intended.
recent_pred_rows = prediction_dataset[prediction_dataset.cal_year>2019]
trn_gini_l.append(
    NormalizedWeightedGini(
        np.exp(recent_pred_rows[target_column]),
        recent_pred_rows[prediction_column_cv],
        recent_pred_rows['cova_ic_nc_water'],
    )
)
trn_mae_l.append(
    mae(np.exp(training_dataset[target_column]), training_dataset[prediction_column_cv])
)
trn_rmse_l.append(
    rmse(np.exp(training_dataset[target_column]), training_dataset[prediction_column_cv])
)

# --- "test" metrics ----------------------------------------------------------
# Gini uses every testing row; MAE and RMSE only rows with cova_ic_nc_water > 0.
test_gini_l.append(
    NormalizedWeightedGini(
        np.exp(testing_dataset[target_column]),
        testing_dataset[prediction_column_cv],
        testing_dataset.cova_ic_nc_water,
    )
)
test_positive_rows = testing_dataset[testing_dataset.cova_ic_nc_water>0]
test_mae_l.append(
    mae(np.exp(test_positive_rows[target_column]), test_positive_rows[prediction_column_cv])
)
test_rmse_l.append(
    rmse(np.exp(test_positive_rows[target_column]), test_positive_rows[prediction_column_cv])
)
In [14]:
# One row per scoring run: the six metric lists are zipped position by position.
score_columns = ['trn_gini','trn_mae','trn_rmse','test_gini','test_mae','test_rmse']
score_rows = list(
    zip(trn_gini_l, trn_mae_l, trn_rmse_l, test_gini_l, test_mae_l, test_rmse_l)
)
ScoresFinal = pd.DataFrame(score_rows, columns=score_columns)
ScoresFinal
Out[14]:
trn_gini trn_mae trn_rmse test_gini test_mae test_rmse
0 0.29784 9674.240242 22346.456421 0.284799 11245.016619 22120.953015
In [15]:
#-----------------------------------------------------------------------------------------------------------
# Persist the three tables, now including the prediction column.
# The target paths are the same files the tables were read from at the top of
# the notebook, so the inputs are overwritten in place.
training_dataset.to_csv(
    '%sproperty_wcs_training_for_normal.csv' % DataDir,
    header=True,
    index=False,
)
testing_dataset.to_csv(
    '%sproperty_wcf_testing.csv' % DataDir,
    header=True,
    index=False,
)
prediction_dataset.to_csv(
    '%sproperty_water_claims_non_cat_fs.csv' % DataDir,
    header=True,
    index=False,
)