library(MASS)
library(lmtest)
library(pscl)
library(xgboost)
library(ggplot2)
library(caret)
source('/home/kate/code/Utils/MyFunctions.R')
ModelsDir <- '/home/kate/Research/Property/Models/'
UseSavedIfExists <- TRUE
training_data <- read.csv("/home/kate/Research/Property/Data/property_wcf_training.csv", header=TRUE)
testing_data <- read.csv("/home/kate/Research/Property/Data/property_wcf_testing.csv", header=TRUE)
NB work with factors and because some numerical columns are factors (ordered) I convert them in factors explicitly. I may need to do this for some other columns which are Yes/No.
training_data$appl_fail_3_blk <- as.factor( training_data$appl_fail_3_blk )
training_data$fixture_leak_3_blk <- as.factor( training_data$fixture_leak_3_blk )
training_data$pipe_froze_3_blk <- as.factor( training_data$pipe_froze_3_blk )
training_data$plumb_leak_3_blk <- as.factor( training_data$plumb_leak_3_blk )
training_data$rep_cost_3_blk <- as.factor( training_data$rep_cost_3_blk )
training_data$ustructure_fail_3_blk <- as.factor( training_data$ustructure_fail_3_blk )
training_data$waterh_fail_3_blk <- as.factor( training_data$waterh_fail_3_blk )
testing_data$appl_fail_3_blk <- as.factor( testing_data$appl_fail_3_blk )
testing_data$fixture_leak_3_blk <- as.factor( testing_data$fixture_leak_3_blk )
testing_data$pipe_froze_3_blk <- as.factor( testing_data$pipe_froze_3_blk )
testing_data$plumb_leak_3_blk <- as.factor( testing_data$plumb_leak_3_blk )
testing_data$rep_cost_3_blk <- as.factor( testing_data$rep_cost_3_blk )
testing_data$ustructure_fail_3_blk <- as.factor( testing_data$ustructure_fail_3_blk )
testing_data$waterh_fail_3_blk <- as.factor( testing_data$waterh_fail_3_blk )
max_yearbuilt <- max(training_data$yearbuilt)
max_sqft <- max(training_data$sqft)
max_water_risk_3_blk <- max(training_data$water_risk_3_blk)
max_water_risk_fre_3_blk <- max(training_data$water_risk_fre_3_blk)
max_water_risk_sev_3_blk <- max(training_data$water_risk_sev_3_blk)
training_data$log_norm_yearbult <- log(training_data$yearbuilt/max_yearbuilt)
training_data$log_norm_sqft <- log(training_data$sqft/max_sqft)
training_data$log_norm_water_risk_3_blk <- log(training_data$water_risk_3_blk/max_water_risk_3_blk)
training_data$log_norm_water_risk_fre_3_blk <- log(training_data$water_risk_fre_3_blk/max_water_risk_fre_3_blk)
training_data$log_norm_water_risk_sev_3_blk <- log(training_data$sqft/max_water_risk_sev_3_blk)
testing_data$log_norm_yearbult <- log(testing_data$yearbuilt/max_yearbuilt)
testing_data$log_norm_sqft <- log(testing_data$sqft/max_sqft)
testing_data$log_norm_water_risk_3_blk <- log(testing_data$water_risk_3_blk/max_water_risk_3_blk)
testing_data$log_norm_water_risk_fre_3_blk <- log(testing_data$water_risk_fre_3_blk/max_water_risk_fre_3_blk)
testing_data$log_norm_water_risk_sev_3_blk <- log(testing_data$sqft/max_water_risk_sev_3_blk)
training_data$stories <- as.factor(training_data$stories)
training_data$units <- as.factor(training_data$units)
training_data$secondaryresidence <- as.factor(training_data$secondaryresidence)
training_data$sprinklersystem <- as.factor(training_data$sprinklersystem)
training_data$firealarmtype <- as.factor(training_data$firealarmtype)
training_data$waterdetectiondevice <- as.factor(training_data$waterdetectiondevice)
training_data$propertymanager <- as.factor(training_data$propertymanager)
training_data$cova_deductible <- as.factor(training_data$cova_deductible)
training_data$cova_limit <- as.factor(training_data$cova_limit )
training_data$waterded <- as.factor(training_data$waterded)
training_data$protectionclass <- as.factor(training_data$protectionclass)
training_data$fire_risk_model_score <- as.factor(training_data$fire_risk_model_score)
training_data$ordinanceorlawpct <- as.factor(training_data$ordinanceorlawpct)
training_data$functionalreplacementcost <- as.factor(training_data$functionalreplacementcost)
training_data$homegardcreditind <- as.factor(training_data$homegardcreditind)
training_data$landlordind <- as.factor(training_data$landlordind)
training_data$rentersinsurance <- as.factor(training_data$rentersinsurance)
training_data$burglaryalarmtype <- as.factor(training_data$burglaryalarmtype)
training_data$propertymanager <- as.factor(training_data$propertymanager)
testing_data$stories <- as.factor(testing_data$stories)
testing_data$units <- as.factor(testing_data$units)
testing_data$secondaryresidence <- as.factor(testing_data$secondaryresidence)
testing_data$sprinklersystem <- as.factor(testing_data$sprinklersystem)
testing_data$firealarmtype <- as.factor(testing_data$firealarmtype)
testing_data$waterdetectiondevice <- as.factor(testing_data$waterdetectiondevice)
testing_data$propertymanager <- as.factor(testing_data$propertymanager)
testing_data$cova_deductible <- as.factor(testing_data$cova_deductible)
testing_data$cova_limit <- as.factor(testing_data$cova_limit )
testing_data$waterded <- as.factor(testing_data$waterded)
testing_data$protectionclass <- as.factor(testing_data$protectionclass)
testing_data$fire_risk_model_score <- as.factor(testing_data$fire_risk_model_score)
testing_data$ordinanceorlawpct <- as.factor(testing_data$ordinanceorlawpct)
testing_data$functionalreplacementcost <- as.factor(testing_data$functionalreplacementcost)
testing_data$homegardcreditind <- as.factor(testing_data$homegardcreditind)
testing_data$landlordind <- as.factor(testing_data$landlordind)
testing_data$rentersinsurance <- as.factor(testing_data$rentersinsurance)
testing_data$burglaryalarmtype <- as.factor(testing_data$burglaryalarmtype)
testing_data$propertymanager <- as.factor(testing_data$propertymanager)
#unique numclaims values
numclaims <- unique(training_data$cova_ic_nc_water)
formula_all <- cova_ic_nc_water ~
log_norm_yearbult +
log_norm_sqft +
stories +
roofcd +
units +
occupancycd +
waterded +
protectionclass +
constructioncd +
fire_risk_model_score +
usagetype +
secondaryresidence +
ordinanceorlawpct +
functionalreplacementcost +
homegardcreditind +
sprinklersystem +
landlordind +
rentersinsurance +
firealarmtype +
burglaryalarmtype +
waterdetectiondevice +
propertymanager +
#cova_deductible +
cova_limit +
log_norm_water_risk_3_blk +
log_norm_water_risk_fre_3_blk +
#log_norm_water_risk_sev_3_blk +
appl_fail_3_blk +
fixture_leak_3_blk +
pipe_froze_3_blk +
plumb_leak_3_blk +
rep_cost_3_blk +
ustructure_fail_3_blk +
waterh_fail_3_blk
ModelFile <- paste(ModelsDir,"zinb_all.rds",sep="")
if(file.exists(ModelFile) && UseSavedIfExists){
zinb_all <- readRDS(ModelFile)
} else {
zinb_all <- zeroinfl(formula_all,offset=log_ecy,data=training_data,dist = "negbin",link= "logit")
saveRDS(zinb_all, ModelFile)
}
summary(zinb_all)
Normalization is needed to make the model work. I was not able to add into the model and test cova_deductible and log_norm_water_risk_sev_3_blk. It returns errors. usagetype does not returns NA in covariance matrix for some levels as in ZI Poisson but stories(3) and fire risk model score do. Some predictors (like ordinanceorlawpct) are marked as significant but really it's a random component I feel.
formula_sign <- cova_ic_nc_water ~
log_norm_yearbult +
log_norm_sqft +
stories +
waterded +
constructioncd +
usagetype +
landlordind +
firealarmtype +
cova_limit +
log_norm_water_risk_3_blk +
log_norm_water_risk_fre_3_blk +
fixture_leak_3_blk +
pipe_froze_3_blk +
plumb_leak_3_blk |
log_norm_yearbult +
log_norm_sqft +
roofcd +
occupancycd +
constructioncd +
cova_limit +
log_norm_water_risk_3_blk +
log_norm_water_risk_fre_3_blk +
pipe_froze_3_blk +
waterh_fail_3_blk
ModelFile <- paste(ModelsDir,"zip_sign.rds",sep="")
if(file.exists(ModelFile) && UseSavedIfExists){
zinb_sign <- readRDS(ModelFile)
} else {
zinb_sign <- zeroinfl(formula_sign,offset=log(exposure),data=training_data,dist = "negbin",link= "logit")
saveRDS(zinb_sign, ModelFile)
}
summary(zinb_sign)
No NA for teh levels as in the complex model. Some predictors which were significant now are not significant.
lrtest(zinb_sign,zinb_all)
vuong(zinb_sign,zinb_all)
The results is a little bit contraversial. According to Raw, the complex model is better then the nested one. However Significant features model is better because it has smaller AIC and BIC and it's significant.
models <- list("All Features" = zinb_all, "Sign Features"=zinb_sign)
#Log likelyhood for all the models and DF
Models_LogLik <- rbind(logLik = sapply(models, function(x) round(logLik(x), digits = 0)),
Df = sapply(models, function(x) attr(logLik(x), "df")))
Models_LogLik
#AIC
Models_AIC <- rbind(AIC = sapply(models, function(x) round(AIC(x), digits = 0)))
Models_AIC
#BIC
Models_BIC <- rbind(BIC = sapply(models, function(x) round(BIC(x), digits = 0)))
Models_BIC
Smaller AIC (BIC) suggests a better model. The model with significant features has a smaller AIC and BIC.
sapply(models, function(x) coef(x)[1:20])
Sometimes the sign of the mean is different in the models
library(countreg)
par(mfrow = c(1, 2))
rootogram(zinb_all,max = max(numclaims),main="All Features",style="hanging")
rootogram(zinb_sign,max = max(numclaims),main="Significant Features",style="hanging")
Significant features model has deviance on numebr claims 2
I use XGB to predict the final result based on probabilities of number claims in each row.
param0 <- list(
"objective" = "count:poisson"
, "eval_metric" = "poisson-nloglik"
)
#Adding predicted probability values
#to training and testing dataset for each numclaims
#as separate column
training_all <- predict(zinb_all, training_data, type='prob')
testing_all <- predict(zinb_all, testing_data, type='prob')
training_sign <- predict(zinb_sign, training_data, type='prob')
testing_sign <- predict(zinb_sign, testing_data, type='prob')
for(unique_value in numclaims){
training_data[paste("zinb_all", unique_value, sep = ".")] <- training_all[,as.numeric(unique_value)+1]
testing_data[paste("zinb_all", unique_value, sep = ".")] <- testing_all[,as.numeric(unique_value)+1]
training_data[paste("zinb_sign", unique_value, sep = ".")] <- training_sign[,as.numeric(unique_value)+1]
testing_data[paste("zinb_sign", unique_value, sep = ".")] <- testing_sign[,as.numeric(unique_value)+1]
}
#Features with probabilities listed by model
zinb_all_features <- list();
zinb_sign_features <- list();
i <- 1
for(unique_value in numclaims){
zinb_all_features[[i]] <- paste("zinb_all", unique_value, sep = ".")
zinb_sign_features[[i]] <- paste("zinb_sign", unique_value, sep = ".")
i <- i + 1
}
xgNB_all_training = xgb.DMatrix(as.matrix(training_data[,unlist(zinb_all_features)]), label = training_data$cova_ic_nc_water)
setinfo(xgNB_all_training, "base_margin", log(training_data$ecy))
xgNB_all_testing = xgb.DMatrix(as.matrix(testing_data[,unlist(zinb_all_features)]), label = testing_data$cova_ic_nc_water)
setinfo(xgNB_all_testing, "base_margin", log(testing_data$ecy))
xgb = xgb.train(
nrounds = 10000
, params = param0
, data = xgNB_all_training
)
training_data$zinb_all = predict(xgb, xgNB_all_training)
testing_data$zinb_all = predict(xgb, xgNB_all_testing)
xgNB_sign_training = xgb.DMatrix(as.matrix(training_data[,unlist(zinb_sign_features)]), label = training_data$cova_ic_nc_water)
setinfo(xgNB_sign_training, "base_margin", log(training_data$ecy))
xgNB_sign_testing = xgb.DMatrix(as.matrix(testing_data[,unlist(zinb_sign_features)]), label = testing_data$cova_ic_nc_water)
setinfo(xgNB_sign_testing, "base_margin", log(testing_data$ecy))
xgb = xgb.train(
nrounds = 10000
, params = param0
, data = xgNB_sign_training
)
training_data$zinb_sign = predict(xgb, xgNB_sign_training)
testing_data$zinb_sign = predict(xgb, xgNB_sign_testing)
As an accuracy measure I use Normalized Weighted Gini:
Testing_Gini <- data.frame ("All" = NormalizedWeightedGini(testing_data$cova_ic_nc_water,testing_data$zinb_all,testing_data$ecy)
,"Significant Only" = NormalizedWeightedGini(testing_data$cova_ic_nc_water,testing_data$zinb_sign,testing_data$ecy)
)
Testing_Gini
Overall, the models are very close to each other. There is a difference in Normalized Weighted Gini which does NOT looks significant.
TBD Does not work with Caret very strightforward and my manual approach is wrong somewhere