import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
import sys
sys.path.append('/home/kate/code/Utils/')
from MyFunctions import NormalizedWeightedGini
from MyFunctions import nLogLik_XGBoost
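# Local helpers (not shown here): an exposure-weighted normalized Gini and a
# negative log-likelihood metric for the XGBoost predictions.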
training_dataset = pd.read_csv('/home/kate/Research/Property/Data/property_wcf_training.csv', on_bad_lines='skip', index_col=False)
testing_dataset = pd.read_csv('/home/kate/Research/Property/Data/property_wcf_testing.csv', on_bad_lines='skip', index_col=False)
features=[
'yearbuilt',
'water_risk_3_blk',
'water_risk_fre_3_blk',
'water_risk_sev_3_blk',
'sqft',
'cova_limit',
'cova_deductible',
'protectionclass',
'fixture_leak_3_blk',
'waterh_fail_3_blk',
'usagetype_encd',
'roofcd_encd',
'fire_risk_model_score',
'pipe_froze_3_blk',
'plumb_leak_3_blk',
'appl_fail_3_blk',
'constractioncd_encd',
'ustructure_fail_3_blk',
'stories'
]
resample_features=[
'yearbuilt',
'water_risk_3_blk',
'water_risk_fre_3_blk',
'water_risk_sev_3_blk',
'sqft',
'cova_limit',
'cova_deductible',
'protectionclass',
'fixture_leak_3_blk',
'waterh_fail_3_blk',
'usagetype_encd',
'roofcd_encd',
'fire_risk_model_score',
'pipe_froze_3_blk',
'plumb_leak_3_blk',
'appl_fail_3_blk',
'constractioncd_encd',
'ustructure_fail_3_blk',
'stories',
'log_ecy'
]
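# resample_features = features + the offset column, so each resampled row
# keeps its matching log_ecy offset through fit_resample.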
target_column = 'cova_ic_nc_water'
offset_column='log_ecy'
prediction_column='xgb_p'
X_test=testing_dataset[features]
y_test=testing_dataset[target_column]
offset_test=testing_dataset[offset_column]
Dtest = xgb.DMatrix(X_test.values)
Dtest.set_base_margin(offset_test.values)
X_train_original=training_dataset[features]
y_train_original=training_dataset[target_column]
offset_train=training_dataset[offset_column]
Dtrain_original = xgb.DMatrix(X_train_original.values)
Dtrain_original.set_base_margin(offset_train.values)
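# With objective count:poisson, the base margin is added to the raw (log-link)
# score, so log_ecy acts as an exposure offset.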
from collections import Counter
Counter(training_dataset[target_column])
According to the Negative Binomial model, the theoretical expected number of zero-claim records is 42833 (computed in the R script). Let's undersample the zero class down toward that level, leaving the non-zero classes at 9412 / 213 / 7.
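As a rough cross-check of that R figure, a minimal sketch (the size `r` and mean `mu` here are hypothetical placeholders; the real values come from the R fit):
from scipy.stats import nbinom
r, mu = 0.6, 0.16          # hypothetical NB size and mean; substitute the fitted values
p = r / (r + mu)           # scipy parameterization: mean = r*(1-p)/p, so P(X=0) = p**r
expected_zeros = len(training_dataset) * nbinom.pmf(0, r, p)
print(expected_zeros)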
OptimalNumberOfZeroClaims=50000
nrounds = 5000
simon_params = {
'objective': 'count:poisson',
'eval_metric': 'poisson-nloglik',
'verbosity': 0,
'booster': 'gbtree',
'eta': 0.01,
'subsample': 0.8,
'colsample_bytree': 0.8,
'min_child_weight': 3,
'max_depth': 6,
'gamma': 0,
'seed': 42}
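# nrounds is fixed (no early stopping), so every sampler below trains the full
# 5000 iterations at eta=0.01.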
log = pd.DataFrame(columns=['Method','gini_train', 'nLogLik_train', 'gini_test', 'nLogLik_test', 'predicted_1_test', 'predicted_1_test_match','% match'])
from imblearn.datasets import make_imbalance
X=training_dataset[resample_features]
y=training_dataset[target_column]
X_res, y_res = make_imbalance(X, y,
sampling_strategy={0: OptimalNumberOfZeroClaims, 1: 9412, 2: 213, 3: 7},
random_state=42)
X_train=X_res[features]
y_train=y_res
offset=X_res[offset_column]
Dtrain = xgb.DMatrix(X_train.values, label=y_train)
Dtrain.set_base_margin(offset.values)
xgb_model = xgb.train(simon_params, Dtrain, nrounds)
training_dataset[prediction_column] = xgb_model.predict(Dtrain_original)
testing_dataset[prediction_column] = xgb_model.predict(Dtest)
gini_train=NormalizedWeightedGini(training_dataset[target_column],training_dataset[prediction_column],training_dataset['ecy'])
nLogLik_train=nLogLik_XGBoost(training_dataset[target_column],training_dataset[prediction_column])
gini_test=NormalizedWeightedGini(testing_dataset[target_column],testing_dataset[prediction_column],testing_dataset['ecy'])
nLogLik_test=nLogLik_XGBoost(testing_dataset[target_column],testing_dataset[prediction_column])
predicted_1_test=testing_dataset[(testing_dataset[prediction_column].round(0) == 1)][prediction_column].count()
predicted_1_test_match=testing_dataset[(testing_dataset[prediction_column].round(0) == 1) & (testing_dataset[target_column] == 1)][prediction_column].count()
log_l = ['make_imbalance', gini_train, nLogLik_train, gini_test, nLogLik_test, predicted_1_test, predicted_1_test_match, 100*predicted_1_test_match/predicted_1_test]
log = pd.concat([log, pd.DataFrame([log_l], columns=log.columns)], ignore_index=True)
print(log_l)
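The train/score/evaluate block above is repeated verbatim for each sampler below; the cells are kept as-is to preserve the step-by-step record, but a small helper along these lines (a hypothetical run_resampled, relying on the frames, DMatrices, and metric functions already defined above) would remove the duplication:
def run_resampled(method_name, X_res, y_res):
    # Train on the resampled data, score the original train/test sets,
    # and return one row for the comparison log.
    Dtrain = xgb.DMatrix(X_res[features].values, label=y_res)
    Dtrain.set_base_margin(X_res[offset_column].values)
    model = xgb.train(simon_params, Dtrain, nrounds)
    training_dataset[prediction_column] = model.predict(Dtrain_original)
    testing_dataset[prediction_column] = model.predict(Dtest)
    pred_1 = testing_dataset[prediction_column].round(0) == 1
    n_pred_1 = pred_1.sum()
    n_match = (pred_1 & (testing_dataset[target_column] == 1)).sum()
    return [method_name,
            NormalizedWeightedGini(training_dataset[target_column], training_dataset[prediction_column], training_dataset['ecy']),
            nLogLik_XGBoost(training_dataset[target_column], training_dataset[prediction_column]),
            NormalizedWeightedGini(testing_dataset[target_column], testing_dataset[prediction_column], testing_dataset['ecy']),
            nLogLik_XGBoost(testing_dataset[target_column], testing_dataset[prediction_column]),
            n_pred_1, n_match, 100 * n_match / n_pred_1]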
from imblearn.under_sampling import RandomUnderSampler
X=training_dataset[resample_features]
y=training_dataset[target_column]
rus = RandomUnderSampler(sampling_strategy={0: OptimalNumberOfZeroClaims, 1: 9412, 2: 213, 3: 7}, random_state=42)
X_res, y_res = rus.fit_resample(X, y)
X_train=X_res[features]
y_train=y_res
offset=X_res[offset_column]
Dtrain = xgb.DMatrix(X_train.values, label=y_train)
Dtrain.set_base_margin(offset.values)
xgb_model = xgb.train(simon_params, Dtrain, nrounds)
training_dataset[prediction_column] = xgb_model.predict(Dtrain_original)
testing_dataset[prediction_column] = xgb_model.predict(Dtest)
gini_train=NormalizedWeightedGini(training_dataset[target_column],training_dataset[prediction_column],training_dataset['ecy'])
nLogLik_train=nLogLik_XGBoost(training_dataset[target_column],training_dataset[prediction_column])
gini_test=NormalizedWeightedGini(testing_dataset[target_column],testing_dataset[prediction_column],testing_dataset['ecy'])
nLogLik_test=nLogLik_XGBoost(testing_dataset[target_column],testing_dataset[prediction_column])
predicted_1_test=testing_dataset[(testing_dataset[prediction_column].round(0) == 1)][prediction_column].count()
predicted_1_test_match=testing_dataset[(testing_dataset[prediction_column].round(0) == 1) & (testing_dataset[target_column] == 1)][prediction_column].count()
log_l = ['RandomUnderSampler', gini_train, nLogLik_train, gini_test, nLogLik_test, predicted_1_test, predicted_1_test_match, 100*predicted_1_test_match/predicted_1_test]
log = pd.concat([log, pd.DataFrame([log_l], columns=log.columns)], ignore_index=True)
print(log_l)
from imblearn.under_sampling import NearMiss
X=training_dataset[resample_features]
y=training_dataset[target_column]
nm = NearMiss(sampling_strategy={0: OptimalNumberOfZeroClaims, 1: 9412, 2: 213, 3: 7})
X_res, y_res = nm.fit_resample(X, y)
X_train=X_res[features]
y_train=y_res
offset=X_res[offset_column]
Dtrain = xgb.DMatrix(X_train.values, label=y_train)
Dtrain.set_base_margin(offset.values)
xgb_model = xgb.train(simon_params, Dtrain, nrounds)
training_dataset[prediction_column] = xgb_model.predict(Dtrain_original)
testing_dataset[prediction_column] = xgb_model.predict(Dtest)
gini_train=NormalizedWeightedGini(training_dataset[target_column],training_dataset[prediction_column],training_dataset['ecy'])
nLogLik_train=nLogLik_XGBoost(training_dataset[target_column],training_dataset[prediction_column])
gini_test=NormalizedWeightedGini(testing_dataset[target_column],testing_dataset[prediction_column],testing_dataset['ecy'])
nLogLik_test=nLogLik_XGBoost(testing_dataset[target_column],testing_dataset[prediction_column])
predicted_1_test=testing_dataset[(testing_dataset[prediction_column].round(0) == 1)][prediction_column].count()
predicted_1_test_match=testing_dataset[(testing_dataset[prediction_column].round(0) == 1) & (testing_dataset[target_column] == 1)][prediction_column].count()
log_l = ['NearMiss', gini_train, nLogLik_train, gini_test, nLogLik_test, predicted_1_test, predicted_1_test_match, 100*predicted_1_test_match/predicted_1_test]
log = pd.concat([log, pd.DataFrame([log_l], columns=log.columns)], ignore_index=True)
print(log_l)
from imblearn.under_sampling import InstanceHardnessThreshold
X=training_dataset[resample_features]
y=training_dataset[target_column]
iht = InstanceHardnessThreshold(sampling_strategy={0: OptimalNumberOfZeroClaims, 1: 9412, 2: 213, 3: 7},random_state=42)
X_res, y_res = iht.fit_resample(X, y)
Counter(y_res)
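# Sanity check: the resampled class counts should match the sampling_strategy dict.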
X_train=X_res[features]
y_train=y_res
offset=X_res[offset_column]
Dtrain = xgb.DMatrix(X_train.values, label=y_train)
Dtrain.set_base_margin(offset.values)
xgb_model = xgb.train(simon_params, Dtrain, nrounds)
training_dataset[prediction_column] = xgb_model.predict(Dtrain_original)
testing_dataset[prediction_column] = xgb_model.predict(Dtest)
gini_train=NormalizedWeightedGini(training_dataset[target_column],training_dataset[prediction_column],training_dataset['ecy'])
nLogLik_train=nLogLik_XGBoost(training_dataset[target_column],training_dataset[prediction_column])
gini_test=NormalizedWeightedGini(testing_dataset[target_column],testing_dataset[prediction_column],testing_dataset['ecy'])
nLogLik_test=nLogLik_XGBoost(testing_dataset[target_column],testing_dataset[prediction_column])
predicted_1_test=testing_dataset[(testing_dataset[prediction_column].round(0) == 1)][prediction_column].count()
predicted_1_test_match=testing_dataset[(testing_dataset[prediction_column].round(0) == 1) & (testing_dataset[target_column] == 1)][prediction_column].count()
log_l = ['InstanceHardnessThreshold', gini_train, nLogLik_train, gini_test, nLogLik_test, predicted_1_test, predicted_1_test_match, 100*predicted_1_test_match/predicted_1_test]
log = pd.concat([log, pd.DataFrame([log_l], columns=log.columns)], ignore_index=True)
print(log_l)
log