In [66]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
In [67]:
import sys

sys.path.append('/home/kate/code/Utils/')

from MyFunctions import NormalizedWeightedGini
from MyFunctions import nLogLik_XGBoost
In [68]:
training_dataset = pd.read_csv('/home/kate/Research/Property/Data/property_wcf_training.csv', error_bad_lines=False, index_col=False)
testing_dataset = pd.read_csv('/home/kate/Research/Property/Data/property_wcf_testing.csv', error_bad_lines=False, index_col=False)
In [69]:
features = [
        'yearbuilt',
        'water_risk_3_blk',
        'water_risk_fre_3_blk',
        'water_risk_sev_3_blk',
        'sqft',
        'cova_limit',
        'cova_deductible',
        'protectionclass',
        'fixture_leak_3_blk',
        'waterh_fail_3_blk',
        'usagetype_encd',
        'roofcd_encd',
        'fire_risk_model_score',
        'pipe_froze_3_blk',
        'plumb_leak_3_blk',
        'appl_fail_3_blk',
        'constractioncd_encd',
        'ustructure_fail_3_blk',
        'stories'
        ]
In [70]:
resample_features = [
        'yearbuilt',
        'water_risk_3_blk',
        'water_risk_fre_3_blk',
        'water_risk_sev_3_blk',
        'sqft',
        'cova_limit',
        'cova_deductible',
        'protectionclass',
        'fixture_leak_3_blk',
        'waterh_fail_3_blk',
        'usagetype_encd',
        'roofcd_encd',
        'fire_risk_model_score',
        'pipe_froze_3_blk',
        'plumb_leak_3_blk',
        'appl_fail_3_blk',
        'constractioncd_encd',
        'ustructure_fail_3_blk',
        'stories',
        'log_ecy'
        ]
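Note that resample_features is just features plus log_ecy: carrying the offset column through the resampling step lets each surviving row keep its own exposure offset when the training DMatrix is built below.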
In [71]:
target_column = 'cova_ic_nc_water'
offset_column = 'log_ecy'
prediction_column = 'xgb_p'
In [72]:
X_test = testing_dataset[features]
y_test = testing_dataset[target_column]
offset_test = testing_dataset[offset_column]
Dtest = xgb.DMatrix(X_test.values)
Dtest.set_base_margin(offset_test.values)
In [73]:
X_train_original = training_dataset[features]
y_train_original = training_dataset[target_column]
offset_train = training_dataset[offset_column]
Dtrain_original = xgb.DMatrix(X_train_original.values)
Dtrain_original.set_base_margin(offset_train.values)
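Both DMatrix objects carry log_ecy as their base margin: with objective 'count:poisson', XGBoost models log E[y | x] = base_margin + f(x), so the expected claim count is ecy * exp(f(x)) and the trees learn a per-exposure claim frequency. A quick sanity check once a booster is trained below (a sketch, not part of the original notebook):

predicted = xgb_model.predict(Dtest)                # equals exp(log_ecy + f(x))
frequency = predicted / np.exp(offset_test.values)  # recovers exp(f(x)), the per-exposure rate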
In [74]:
from collections import Counter
In [75]:
Counter(training_dataset[target_column])
Out[75]:
Counter({0: 881958, 1: 9412, 2: 213, 3: 7})

Only about 1.1% of training records (9,632 of 891,590) have at least one claim. According to a Negative Binomial model fitted in a separate R script, the theoretical expected number of zero-claim records is 42,833. Let's undersample the zero-claim class toward that order of magnitude (rounding up to 50,000):

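For reference, a minimal sketch of how such a theoretical zero count can be computed from a fitted Negative Binomial in Python (the size r and mean mu below are hypothetical placeholders, not the parameters from the original R script):

from scipy.stats import nbinom

n_records = 881958 + 9412 + 213 + 7   # total training records, from the Counter above
r, mu = 0.05, 0.011                   # hypothetical NB size/mean; substitute the fitted values
p = r / (r + mu)                      # convert to scipy's (n, p) parameterization
expected_zeros = n_records * nbinom.pmf(0, r, p)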
In [76]:
OptimalNumberOfZeroClaims=50000
In [77]:
nrounds = 5000
simon_params = {
        'objective': 'count:poisson',       # Poisson regression for claim counts
        'eval_metric': 'poisson-nloglik',   # Poisson negative log-likelihood
        'silent': True,
        'booster': 'gbtree',
        'eta': 0.01,                        # learning rate
        'subsample': 0.8,                   # row sampling per tree
        'colsample_bytree': 0.8,            # feature sampling per tree
        'min_child_weight': 3,              # min sum of instance weights per leaf
        'max_depth': 6,
        'gamma': 0,                         # min loss reduction required to split
        'seed': 42}
In [78]:
log = pd.DataFrame(columns=['Method','gini_train', 'nLogLik_train', 'gini_test', 'nLogLik_test', 'predicted_1_test', 'predicted_1_test_match','% match'])
In [79]:
from imblearn.datasets import make_imbalance
X = training_dataset[resample_features]
y = training_dataset[target_column]

# Undersample the zero-claim class to the target count; keep every claim record
X_res, y_res = make_imbalance(X, y,
                      sampling_strategy={0: OptimalNumberOfZeroClaims, 1: 9412, 2: 213, 3: 7},
                      random_state=42)

X_train = X_res[features]
y_train = y_res
offset = X_res[offset_column]           # log_ecy travels with the resampled rows
Dtrain = xgb.DMatrix(X_train.values, y_train)
Dtrain.set_base_margin(offset.values)   # exposure offset for the Poisson model

xgb_model = xgb.train(simon_params, Dtrain, nrounds)

# Score the full (unsampled) training set and the test set
training_dataset[prediction_column] = xgb_model.predict(Dtrain_original)
testing_dataset[prediction_column] = xgb_model.predict(Dtest)

gini_train = NormalizedWeightedGini(training_dataset[target_column], training_dataset[prediction_column], training_dataset['ecy'])
nLogLik_train = nLogLik_XGBoost(training_dataset[target_column], training_dataset[prediction_column])

gini_test = NormalizedWeightedGini(testing_dataset[target_column], testing_dataset[prediction_column], testing_dataset['ecy'])
nLogLik_test = nLogLik_XGBoost(testing_dataset[target_column], testing_dataset[prediction_column])

# Test policies whose predicted count rounds to 1, and how many of those actually had one claim
predicted_1_test = testing_dataset[testing_dataset[prediction_column].round(0) == 1][prediction_column].count()
predicted_1_test_match = testing_dataset[(testing_dataset[prediction_column].round(0) == 1) & (testing_dataset[target_column] == 1)][prediction_column].count()

log_l = ['make_imbalance', gini_train, nLogLik_train, gini_test, nLogLik_test, predicted_1_test, predicted_1_test_match, 100*predicted_1_test_match/predicted_1_test]
log = log.append(pd.Series(log_l, index=log.columns), ignore_index=True)

print(log_l)
['make_imbalance', 0.6437747337656646, 0.1608124077320099, 0.43700577223068254, 0.1404825896024704, 1297, 44, 3.392444101773323]
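The train-and-score block above is repeated verbatim for each sampler below. A hypothetical helper could collapse the duplication (evaluate_sampler is not in the original notebook, and it assumes NormalizedWeightedGini and nLogLik_XGBoost accept numpy arrays for the predictions):

def evaluate_sampler(method_name, X_res, y_res):
    # Rebuild the offset-aware training matrix from the resampled rows
    Dtrain = xgb.DMatrix(X_res[features].values, y_res)
    Dtrain.set_base_margin(X_res[offset_column].values)
    model = xgb.train(simon_params, Dtrain, nrounds)
    train_pred = model.predict(Dtrain_original)
    test_pred = model.predict(Dtest)
    pred_1 = test_pred.round(0) == 1
    n_pred_1 = int(pred_1.sum())
    n_match = int((pred_1 & (testing_dataset[target_column] == 1).values).sum())
    return [method_name,
            NormalizedWeightedGini(training_dataset[target_column], train_pred, training_dataset['ecy']),
            nLogLik_XGBoost(training_dataset[target_column], train_pred),
            NormalizedWeightedGini(testing_dataset[target_column], test_pred, testing_dataset['ecy']),
            nLogLik_XGBoost(testing_dataset[target_column], test_pred),
            n_pred_1, n_match, 100 * n_match / n_pred_1]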
In [80]:
from imblearn.under_sampling import RandomUnderSampler
X = training_dataset[resample_features]
y = training_dataset[target_column]
rus = RandomUnderSampler(sampling_strategy={0: OptimalNumberOfZeroClaims, 1: 9412, 2: 213, 3: 7}, random_state=42)
X_res, y_res = rus.fit_resample(X, y)

X_train = X_res[features]
y_train = y_res
offset = X_res[offset_column]
Dtrain = xgb.DMatrix(X_train.values, y_train)
Dtrain.set_base_margin(offset.values)

xgb_model = xgb.train(simon_params, Dtrain, nrounds)

training_dataset[prediction_column] = xgb_model.predict(Dtrain_original)
testing_dataset[prediction_column] = xgb_model.predict(Dtest)

gini_train = NormalizedWeightedGini(training_dataset[target_column], training_dataset[prediction_column], training_dataset['ecy'])
nLogLik_train = nLogLik_XGBoost(training_dataset[target_column], training_dataset[prediction_column])

gini_test = NormalizedWeightedGini(testing_dataset[target_column], testing_dataset[prediction_column], testing_dataset['ecy'])
nLogLik_test = nLogLik_XGBoost(testing_dataset[target_column], testing_dataset[prediction_column])

predicted_1_test = testing_dataset[testing_dataset[prediction_column].round(0) == 1][prediction_column].count()
predicted_1_test_match = testing_dataset[(testing_dataset[prediction_column].round(0) == 1) & (testing_dataset[target_column] == 1)][prediction_column].count()

log_l = ['RandomUnderSampler', gini_train, nLogLik_train, gini_test, nLogLik_test, predicted_1_test, predicted_1_test_match, 100*predicted_1_test_match/predicted_1_test]
log = log.append(pd.Series(log_l, index=log.columns), ignore_index=True)

print(log_l)
['RandomUnderSampler', 0.6437747337656646, 0.1608124077320099, 0.43700577223068254, 0.1404825896024704, 1297, 44, 3.392444101773323]
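The metrics are identical to the make_imbalance row, which is expected: with the same sampling_strategy and random_state, make_imbalance appears to perform the same random under-sampling internally, so both approaches select the same 50,000 zero-claim records.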
In [81]:
from imblearn.under_sampling import NearMiss
X = training_dataset[resample_features]
y = training_dataset[target_column]
nm = NearMiss(sampling_strategy={0: OptimalNumberOfZeroClaims, 1: 9412, 2: 213, 3: 7})
X_res, y_res = nm.fit_resample(X, y)

X_train = X_res[features]
y_train = y_res
offset = X_res[offset_column]
Dtrain = xgb.DMatrix(X_train.values, y_train)
Dtrain.set_base_margin(offset.values)

xgb_model = xgb.train(simon_params, Dtrain, nrounds)

training_dataset[prediction_column] = xgb_model.predict(Dtrain_original)
testing_dataset[prediction_column] = xgb_model.predict(Dtest)

gini_train = NormalizedWeightedGini(training_dataset[target_column], training_dataset[prediction_column], training_dataset['ecy'])
nLogLik_train = nLogLik_XGBoost(training_dataset[target_column], training_dataset[prediction_column])

gini_test = NormalizedWeightedGini(testing_dataset[target_column], testing_dataset[prediction_column], testing_dataset['ecy'])
nLogLik_test = nLogLik_XGBoost(testing_dataset[target_column], testing_dataset[prediction_column])

predicted_1_test = testing_dataset[testing_dataset[prediction_column].round(0) == 1][prediction_column].count()
predicted_1_test_match = testing_dataset[(testing_dataset[prediction_column].round(0) == 1) & (testing_dataset[target_column] == 1)][prediction_column].count()

log_l = ['NearMiss', gini_train, nLogLik_train, gini_test, nLogLik_test, predicted_1_test, predicted_1_test_match, 100*predicted_1_test_match/predicted_1_test]
log = log.append(pd.Series(log_l, index=log.columns), ignore_index=True)

print(log_l)
['NearMiss', 0.03505504473204299, 0.8294628262519836, 0.12487821984977239, 0.8477267026901245, 178366, 1248, 0.6996849175291255]
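NearMiss (version 1 by default) keeps the zero-claim records whose average distance to the nearest claim records is smallest. That heuristic retains a highly unrepresentative slice of the majority class here: the Gini collapses and the negative log-likelihood deteriorates sharply.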
In [82]:
from imblearn.under_sampling import InstanceHardnessThreshold
X = training_dataset[resample_features]
y = training_dataset[target_column]
iht = InstanceHardnessThreshold(sampling_strategy={0: OptimalNumberOfZeroClaims, 1: 9412, 2: 213, 3: 7}, random_state=42)
X_res, y_res = iht.fit_resample(X, y)
Counter(y_res)
Out[82]:
Counter({0: 556921, 1: 9412, 2: 213, 3: 7})
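Note that the sampler did not reach the requested 50,000 zero-claim records: InstanceHardnessThreshold removes only the samples its internal classifier flags as hard to classify, so the counts in sampling_strategy are a target that is not guaranteed; 556,921 zero-claim records remain.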
In [83]:
# Reuse X_res, y_res from the previous cell instead of refitting the sampler
X_train = X_res[features]
y_train = y_res
offset = X_res[offset_column]
Dtrain = xgb.DMatrix(X_train.values, y_train)
Dtrain.set_base_margin(offset.values)

xgb_model = xgb.train(simon_params, Dtrain, nrounds)

training_dataset[prediction_column] = xgb_model.predict(Dtrain_original)
testing_dataset[prediction_column] = xgb_model.predict(Dtest)

gini_train = NormalizedWeightedGini(training_dataset[target_column], training_dataset[prediction_column], training_dataset['ecy'])
nLogLik_train = nLogLik_XGBoost(training_dataset[target_column], training_dataset[prediction_column])

gini_test = NormalizedWeightedGini(testing_dataset[target_column], testing_dataset[prediction_column], testing_dataset['ecy'])
nLogLik_test = nLogLik_XGBoost(testing_dataset[target_column], testing_dataset[prediction_column])

predicted_1_test = testing_dataset[testing_dataset[prediction_column].round(0) == 1][prediction_column].count()
predicted_1_test_match = testing_dataset[(testing_dataset[prediction_column].round(0) == 1) & (testing_dataset[target_column] == 1)][prediction_column].count()

log_l = ['InstanceHardnessThreshold', gini_train, nLogLik_train, gini_test, nLogLik_test, predicted_1_test, predicted_1_test_match, 100*predicted_1_test_match/predicted_1_test]
log = log.append(pd.Series(log_l, index=log.columns), ignore_index=True)

print(log_l)
['InstanceHardnessThreshold', 0.5851750668737364, 0.05803235247731209, 0.4383513064343097, 0.0457218773663044, 59, 2, 3.389830508474576]
In [84]:
log
Out[84]:
  Method                      gini_train  nLogLik_train  gini_test  nLogLik_test  predicted_1_test  predicted_1_test_match   % match
0 make_imbalance                0.643775       0.160812   0.437006      0.140483              1297                      44  3.392444
1 RandomUnderSampler            0.643775       0.160812   0.437006      0.140483              1297                      44  3.392444
2 NearMiss                      0.035055       0.829463   0.124878      0.847727            178366                    1248  0.699685
3 InstanceHardnessThreshold     0.585175       0.058032   0.438351      0.045722                59                       2  3.389831
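Takeaway: random under-sampling (and its make_imbalance equivalent) and InstanceHardnessThreshold reach similar test Gini (~0.437-0.438), with InstanceHardnessThreshold giving a noticeably better test negative log-likelihood; NearMiss is clearly unsuitable for this dataset.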