# --- Run configuration -------------------------------------------------
# Directories (each ends with a slash because paths are built with '%s' formatting).
DataDir = '/home/kate/Research/Property/Data/'
ModelsDir = '/home/kate/Research/Property/Models/'

# Base name of the per-fold model files stored under ModelsDir.
ModelName = 'wc_Linear_Reg_XGB_mae'

# When True, a fold model already saved on disk is loaded instead of retrained.
UseSavedIfExists = True
import sys
sys.path.append('/home/kate/code/Utils/')
from MyFunctions import NormalizedWeightedGini
from MyFunctions import mae
from MyFunctions import rmse
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import os
# Load the three CSV inputs found under DataDir into DataFrames.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# pandas 2.0 (successor: `on_bad_lines='skip'`). It is kept unchanged here so
# the script behaves the same on the pandas version it was written for --
# confirm the installed version before upgrading.
_read_csv_options = dict(error_bad_lines=False, index_col=False)

training_dataset = pd.read_csv(
    '%sproperty_wcs_training_for_normal.csv' % DataDir, **_read_csv_options
)
testing_dataset = pd.read_csv(
    '%sproperty_wcf_testing.csv' % DataDir, **_read_csv_options
)
prediction_dataset = pd.read_csv(
    '%sproperty_water_claims_non_cat_fs.csv' % DataDir, **_read_csv_options
)
# Column the models are fitted against.
target_column = 'log_cova_il_nc_water'

# Column that receives the fold-averaged ensemble prediction.
prediction_column_cv = 'lin_reg_xgb_mae'

# Model inputs, in the column order used to build every feature matrix.
features = [
    'cova_deductible',
    'roofcd_encd',
    'water_risk_sev_3_blk',
    'sqft',
    'rep_cost_3_blk',
    'yearbuilt',
    'ecy',
    'usagetype_encd',
]
# Full training set: features plus target, wrapped as a labelled DMatrix.
X = training_dataset[features]
y = training_dataset[target_column]
Dtrain = xgb.DMatrix(X.values, label=y)

# Testing set: only the features go into the DMatrix (no label attached).
X_test = testing_dataset[features]
y_test = testing_dataset[target_column]
Dtest = xgb.DMatrix(X_test.values)

# Prediction set: likewise an unlabelled DMatrix.
X_pred = prediction_dataset[features]
y_pred = prediction_dataset[target_column]
Dpred = xgb.DMatrix(X_pred.values)
# Upper bound on boosting rounds and the early-stopping patience, both
# passed to xgb.train further down.
nrounds = 600
esr = 100

# Booster settings handed to xgb.train.
# NOTE(review): 'silent' and 'reg:linear' are spellings from older XGBoost
# releases (later superseded by 'verbosity' and 'reg:squarederror') --
# confirm against the installed XGBoost version before upgrading.
xgb_params = dict(
    seed=42,
    eta=0.01,
    colsample_bytree=0.9,
    silent=1,
    subsample=0.9,
    objective='reg:linear',
    eval_metric='mae',
    max_depth=6,
    gamma=0.4,
    min_child_weight=4,
)
#Training gini/mae/rmse: 0.518429 9781.028079 22263.22878
#Test gini/mae/rmse: 0.253657 11287.45169 22739.13832
#Test GLM 0.243 - 22092
# Number of folds; the loop below reads one 'fold_<i>' column per fold.
kfold = 5

# Reset the ensemble-prediction column to zero in every dataset so the
# per-fold predictions can be accumulated into it.
for _dataset in (training_dataset, testing_dataset, prediction_dataset):
    _dataset[prediction_column_cv] = 0
#
# Score accumulators: gini / mae / rmse, filled under the "trn" and "test"
# labels after prediction and later zipped into the ScoresFinal table.
trn_gini_l, trn_mae_l, trn_rmse_l = [], [], []
test_gini_l, test_mae_l, test_rmse_l = [], [], []
#
# K-fold ensemble: fit (or reload) one XGBoost model per fold, then add each
# model's prediction, divided by kfold, into the prediction column of the
# training, testing and prediction datasets. After the loop that column holds
# the average of the fold models' predictions.
for i in range(kfold):
    print(' fold: {} of {} : '.format(i+1, kfold))
    # Rows whose 'fold_<i>' flag is 0 are held out for validation; rows with
    # a positive flag are used for fitting.
    training_dataset_fold = training_dataset[training_dataset['fold_%s' % i] > 0]
    validation_dataset = training_dataset[training_dataset['fold_%s' % i] == 0]

    # preparing for XGB run: plain arrays for the DMatrix constructor
    X_train = training_dataset_fold[features].values
    X_valid = validation_dataset[features].values
    y_train = training_dataset_fold[target_column].values
    y_valid = validation_dataset[target_column].values

    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    # The last watchlist entry ('valid') is the one early stopping monitors.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # applying XGB
    xgb_model_file = '%s%s_%s.model' % (ModelsDir, ModelName, i)
    if UseSavedIfExists and os.path.exists(xgb_model_file):
        print('%s file exists. Reading model from the file' % xgb_model_file)
        # NOTE: unpickling can execute arbitrary code; only load model files
        # that this script (or another trusted source) wrote.
        with open(xgb_model_file, 'rb') as model_fh:
            xgb_model = pickle.load(model_fh)
    else:
        print('%s file does not exists. Training model...' % xgb_model_file)
        xgb_model = xgb.train(xgb_params, d_train, nrounds, watchlist,
                              verbose_eval=100, early_stopping_rounds=esr)
        # Context manager guarantees the file is flushed and closed.
        with open(xgb_model_file, 'wb') as model_fh:
            pickle.dump(xgb_model, model_fh)

    # Predict with 50 trees beyond the model's best_ntree_limit.
    ntree_limit = xgb_model.best_ntree_limit + 50
    training_dataset[prediction_column_cv] += xgb_model.predict(Dtrain, ntree_limit=ntree_limit) / kfold
    testing_dataset[prediction_column_cv] += xgb_model.predict(Dtest, ntree_limit=ntree_limit) / kfold
    prediction_dataset[prediction_column_cv] += xgb_model.predict(Dpred, ntree_limit=ntree_limit) / kfold
# Exponentiate the accumulated prediction column in every dataset.
# (Presumably this maps predictions back from the log scale of the target,
# whose name starts with 'log_' -- confirm.)
for _dataset in (training_dataset, testing_dataset, prediction_dataset):
    _dataset[prediction_column_cv] = np.exp(_dataset[prediction_column_cv])

# NOTE(review): the value stored as "trn_gini" is computed on prediction_dataset
# rows with cal_year > 2019, not on training_dataset -- confirm this is intended.
_recent_pred = prediction_dataset[prediction_dataset.cal_year > 2019]
trn_gini_l.append(
    NormalizedWeightedGini(
        np.exp(_recent_pred[target_column]),
        _recent_pred[prediction_column_cv],
        _recent_pred['cova_ic_nc_water'],
    )
)

# Training errors use every training row, with the target exponentiated too.
_train_actual = np.exp(training_dataset[target_column])
_train_predicted = training_dataset[prediction_column_cv]
trn_mae_l.append(mae(_train_actual, _train_predicted))
trn_rmse_l.append(rmse(_train_actual, _train_predicted))

# Test gini uses every testing row ...
test_gini_l.append(
    NormalizedWeightedGini(
        np.exp(testing_dataset[target_column]),
        testing_dataset[prediction_column_cv],
        testing_dataset.cova_ic_nc_water,
    )
)

# ... while test mae / rmse only use rows where cova_ic_nc_water is positive.
_test_positive = testing_dataset[testing_dataset.cova_ic_nc_water > 0]
_test_actual = np.exp(_test_positive[target_column])
_test_predicted = _test_positive[prediction_column_cv]
test_mae_l.append(mae(_test_actual, _test_predicted))
test_rmse_l.append(rmse(_test_actual, _test_predicted))
# Assemble the collected scores into a table: one row per appended score set,
# one column per metric.
_score_columns = ['trn_gini','trn_mae','trn_rmse','test_gini','test_mae','test_rmse']
_score_rows = list(
    zip(trn_gini_l, trn_mae_l, trn_rmse_l, test_gini_l, test_mae_l, test_rmse_l)
)
ScoresFinal = pd.DataFrame(_score_rows, columns=_score_columns)
# Bare expression: shows the table when this is the last statement of a
# notebook cell; it has no effect when run as a plain script.
ScoresFinal
#-----------------------------------------------------------------------------------------------------------
# Write each dataset, now carrying the ensemble prediction column, back to
# DataDir. The file names are the same ones the datasets were read from, so
# the input files are overwritten in place.
_outputs = (
    (training_dataset, '%sproperty_wcs_training_for_normal.csv'),
    (testing_dataset, '%sproperty_wcf_testing.csv'),
    (prediction_dataset, '%sproperty_water_claims_non_cat_fs.csv'),
)
for _dataset, _path_template in _outputs:
    _dataset.to_csv(_path_template % DataDir, header=True, index=False)