library(MASS)
library(lmtest)
library(pscl)
library(xgboost)
library(ggplot2)
library(caret)
source('/home/kate/code/Utils/MyFunctions.R')
ModelsDir <- '/home/kate/Research/Property/Models/'
UseSavedIfExists <- TRUE
training_data <- read.csv("/home/kate/Research/Property/Data/property_wcf_training.csv", header=TRUE)
testing_data <- read.csv("/home/kate/Research/Property/Data/property_wcf_testing.csv", header=TRUE)
NB work with factors and because some numerical columns are factors (ordered) I convert them in factors explicitly. I may need to do this for some other columns which are Yes/No.
training_data$appl_fail_3_blk <- as.factor( training_data$appl_fail_3_blk )
training_data$fixture_leak_3_blk <- as.factor( training_data$fixture_leak_3_blk )
training_data$pipe_froze_3_blk <- as.factor( training_data$pipe_froze_3_blk )
training_data$plumb_leak_3_blk <- as.factor( training_data$plumb_leak_3_blk )
training_data$rep_cost_3_blk <- as.factor( training_data$rep_cost_3_blk )
training_data$ustructure_fail_3_blk <- as.factor( training_data$ustructure_fail_3_blk )
training_data$waterh_fail_3_blk <- as.factor( training_data$waterh_fail_3_blk )
testing_data$appl_fail_3_blk <- as.factor( testing_data$appl_fail_3_blk )
testing_data$fixture_leak_3_blk <- as.factor( testing_data$fixture_leak_3_blk )
testing_data$pipe_froze_3_blk <- as.factor( testing_data$pipe_froze_3_blk )
testing_data$plumb_leak_3_blk <- as.factor( testing_data$plumb_leak_3_blk )
testing_data$rep_cost_3_blk <- as.factor( testing_data$rep_cost_3_blk )
testing_data$ustructure_fail_3_blk <- as.factor( testing_data$ustructure_fail_3_blk )
testing_data$waterh_fail_3_blk <- as.factor( testing_data$waterh_fail_3_blk )
max_yearbuilt <- max(training_data$yearbuilt)
max_sqft <- max(training_data$sqft)
max_water_risk_3_blk <- max(training_data$water_risk_3_blk)
max_water_risk_fre_3_blk <- max(training_data$water_risk_fre_3_blk)
max_water_risk_sev_3_blk <- max(training_data$water_risk_sev_3_blk)
training_data$log_norm_yearbult <- log(training_data$yearbuilt/max_yearbuilt)
training_data$log_norm_sqft <- log(training_data$sqft/max_sqft)
training_data$log_norm_water_risk_3_blk <- log(training_data$water_risk_3_blk/max_water_risk_3_blk)
training_data$log_norm_water_risk_fre_3_blk <- log(training_data$water_risk_fre_3_blk/max_water_risk_fre_3_blk)
training_data$log_norm_water_risk_sev_3_blk <- log(training_data$sqft/max_water_risk_sev_3_blk)
testing_data$log_norm_yearbult <- log(testing_data$yearbuilt/max_yearbuilt)
testing_data$log_norm_sqft <- log(testing_data$sqft/max_sqft)
testing_data$log_norm_water_risk_3_blk <- log(testing_data$water_risk_3_blk/max_water_risk_3_blk)
testing_data$log_norm_water_risk_fre_3_blk <- log(testing_data$water_risk_fre_3_blk/max_water_risk_fre_3_blk)
testing_data$log_norm_water_risk_sev_3_blk <- log(testing_data$sqft/max_water_risk_sev_3_blk)
training_data$stories <- as.factor(training_data$stories)
training_data$units <- as.factor(training_data$units)
training_data$secondaryresidence <- as.factor(training_data$secondaryresidence)
training_data$sprinklersystem <- as.factor(training_data$sprinklersystem)
training_data$firealarmtype <- as.factor(training_data$firealarmtype)
training_data$waterdetectiondevice <- as.factor(training_data$waterdetectiondevice)
training_data$propertymanager <- as.factor(training_data$propertymanager)
training_data$cova_deductible <- as.factor(training_data$cova_deductible)
training_data$cova_limit <- as.factor(training_data$cova_limit )
training_data$waterded <- as.factor(training_data$waterded)
training_data$protectionclass <- as.factor(training_data$protectionclass)
training_data$fire_risk_model_score <- as.factor(training_data$fire_risk_model_score)
training_data$ordinanceorlawpct <- as.factor(training_data$ordinanceorlawpct)
training_data$functionalreplacementcost <- as.factor(training_data$functionalreplacementcost)
training_data$homegardcreditind <- as.factor(training_data$homegardcreditind)
training_data$landlordind <- as.factor(training_data$landlordind)
training_data$rentersinsurance <- as.factor(training_data$rentersinsurance)
training_data$burglaryalarmtype <- as.factor(training_data$burglaryalarmtype)
training_data$propertymanager <- as.factor(training_data$propertymanager)
testing_data$stories <- as.factor(testing_data$stories)
testing_data$units <- as.factor(testing_data$units)
testing_data$secondaryresidence <- as.factor(testing_data$secondaryresidence)
testing_data$sprinklersystem <- as.factor(testing_data$sprinklersystem)
testing_data$firealarmtype <- as.factor(testing_data$firealarmtype)
testing_data$waterdetectiondevice <- as.factor(testing_data$waterdetectiondevice)
testing_data$propertymanager <- as.factor(testing_data$propertymanager)
testing_data$cova_deductible <- as.factor(testing_data$cova_deductible)
testing_data$cova_limit <- as.factor(testing_data$cova_limit )
testing_data$waterded <- as.factor(testing_data$waterded)
testing_data$protectionclass <- as.factor(testing_data$protectionclass)
testing_data$fire_risk_model_score <- as.factor(testing_data$fire_risk_model_score)
testing_data$ordinanceorlawpct <- as.factor(testing_data$ordinanceorlawpct)
testing_data$functionalreplacementcost <- as.factor(testing_data$functionalreplacementcost)
testing_data$homegardcreditind <- as.factor(testing_data$homegardcreditind)
testing_data$landlordind <- as.factor(testing_data$landlordind)
testing_data$rentersinsurance <- as.factor(testing_data$rentersinsurance)
testing_data$burglaryalarmtype <- as.factor(testing_data$burglaryalarmtype)
testing_data$propertymanager <- as.factor(testing_data$propertymanager)
#unique numclaims values
numclaims <- unique(training_data$cova_ic_nc_water)
formula_sign <- cova_ic_nc_water ~
log_norm_yearbult +
log_norm_sqft +
#stories +
#roofcd +
#units +
#waterded +
#protectionclass +
#constructioncd +
#sprinklersystem +
#landlordind +
#firealarmtype +
cova_limit +
log_norm_water_risk_3_blk +
log_norm_water_risk_fre_3_blk +
appl_fail_3_blk +
fixture_leak_3_blk +
#pipe_froze_3_blk +
plumb_leak_3_blk +
waterh_fail_3_blk |
log_norm_yearbult +
log_norm_sqft +
stories +
roofcd +
units +
waterded +
protectionclass +
constructioncd +
sprinklersystem +
landlordind +
firealarmtype +
cova_limit +
log_norm_water_risk_3_blk +
log_norm_water_risk_fre_3_blk +
appl_fail_3_blk +
fixture_leak_3_blk +
pipe_froze_3_blk +
plumb_leak_3_blk +
waterh_fail_3_blk
UseSavedIfExists <- FALSE
ModelFile <- paste(ModelsDir,"hurdlenb_sign.rds",sep="")
if(file.exists(ModelFile) && UseSavedIfExists){
hurdlenb_sign <- readRDS(ModelFile)
} else {
hurdlenb_sign <- hurdle(formula_sign, data = training_data, offset=log_ecy,dist ="negbin",zero.dist = "negbin",link= "logit")
saveRDS(hurdlenb_sign, ModelFile)
}
summary(hurdlenb_sign)
Hurdle NB model is even more sensitive to the predictors then Hurdle Poisson or Zero Inflated. The predictors above are only I was able to add to get non NAN resullts.
library(countreg)
par(mfrow = c(1, 2))
rootogram(hurdlenb_sign,max = max(numclaims),main="Significant Features",style="hanging")
I use XGB to predict the final result based on probabilities of number claims in each row.
param0 <- list(
"objective" = "count:poisson"
, "eval_metric" = "poisson-nloglik"
)
#Adding predicted probability values
#to training and testing dataset for each numclaims
#as separate column
training_sign <- predict(hurdlenb_sign, training_data, type='prob')
testing_sign <- predict(hurdlenb_sign, testing_data, type='prob')
for(unique_value in numclaims){
training_data[paste("hurdlenb_sign", unique_value, sep = ".")] <- training_sign[,as.numeric(unique_value)+1]
testing_data[paste("hurdlenb_sign", unique_value, sep = ".")] <- testing_sign[,as.numeric(unique_value)+1]
}
#Features with probabilities listed by model
HNB_sign_features <- list();
i <- 1
for(unique_value in numclaims){
HNB_sign_features[[i]] <- paste("hurdlenb_sign", unique_value, sep = ".")
i <- i + 1
}
xgHNB_sign_training = xgb.DMatrix(as.matrix(training_data[,unlist(HNB_sign_features)]), label = training_data$cova_ic_nc_water)
setinfo(xgHNB_sign_training, "base_margin", log(training_data$ecy))
xgHNB_sign_testing = xgb.DMatrix(as.matrix(testing_data[,unlist(HNB_sign_features)]), label = testing_data$cova_ic_nc_water)
setinfo(xgHNB_sign_testing, "base_margin", log(testing_data$ecy))
xgb = xgb.train(
nrounds = 10000
, params = param0
, data = xgHNB_sign_training
)
training_data$hurdlenb_sign = predict(xgb, xgHNB_sign_training)
testing_data$hurdlenb_sign = predict(xgb, xgHNB_sign_testing)
As an accuracy measure I use Normalized Weighted Gini:
Testing_Gini <- data.frame ("Significant Only" = NormalizedWeightedGini(testing_data$cova_ic_nc_water,testing_data$hurdlenb_sign,testing_data$ecy)
)
Testing_Gini
TBD Does not work with Caret very strightforward and my manual approach is wrong somewhere