library(MASS)
library(lmtest)
library(pscl)
library(xgboost)
library(ggplot2)
library(caret)
source('/home/kate/code/Utils/MyFunctions.R')
ModelsDir <- '/home/kate/Research/Property/Models/'
UseSavedIfExists <- FALSE
training_data <- read.csv("/home/kate/Research/Property/Data/property_wcf_training.csv", header=TRUE)
testing_data <- read.csv("/home/kate/Research/Property/Data/property_wcf_testing.csv", header=TRUE)
NB work with factors and because some numerical columns are factors (ordered) I convert them in factors explicitly. I may need to do this for some other columns which are Yes/No.
training_data$appl_fail_3_blk <- as.factor( training_data$appl_fail_3_blk )
training_data$fixture_leak_3_blk <- as.factor( training_data$fixture_leak_3_blk )
training_data$pipe_froze_3_blk <- as.factor( training_data$pipe_froze_3_blk )
training_data$plumb_leak_3_blk <- as.factor( training_data$plumb_leak_3_blk )
training_data$rep_cost_3_blk <- as.factor( training_data$rep_cost_3_blk )
training_data$ustructure_fail_3_blk <- as.factor( training_data$ustructure_fail_3_blk )
training_data$waterh_fail_3_blk <- as.factor( training_data$waterh_fail_3_blk )
testing_data$appl_fail_3_blk <- as.factor( testing_data$appl_fail_3_blk )
testing_data$fixture_leak_3_blk <- as.factor( testing_data$fixture_leak_3_blk )
testing_data$pipe_froze_3_blk <- as.factor( testing_data$pipe_froze_3_blk )
testing_data$plumb_leak_3_blk <- as.factor( testing_data$plumb_leak_3_blk )
testing_data$rep_cost_3_blk <- as.factor( testing_data$rep_cost_3_blk )
testing_data$ustructure_fail_3_blk <- as.factor( testing_data$ustructure_fail_3_blk )
testing_data$waterh_fail_3_blk <- as.factor( testing_data$waterh_fail_3_blk )
max_yearbuilt <- max(training_data$yearbuilt)
max_sqft <- max(training_data$sqft)
max_water_risk_3_blk <- max(training_data$water_risk_3_blk)
max_water_risk_fre_3_blk <- max(training_data$water_risk_fre_3_blk)
max_water_risk_sev_3_blk <- max(training_data$water_risk_sev_3_blk)
training_data$log_norm_yearbult <- log(training_data$yearbuilt/max_yearbuilt)
training_data$log_norm_sqft <- log(training_data$sqft/max_sqft)
training_data$log_norm_water_risk_3_blk <- log(training_data$water_risk_3_blk/max_water_risk_3_blk)
training_data$log_norm_water_risk_fre_3_blk <- log(training_data$water_risk_fre_3_blk/max_water_risk_fre_3_blk)
training_data$log_norm_water_risk_sev_3_blk <- log(training_data$sqft/max_water_risk_sev_3_blk)
testing_data$log_norm_yearbult <- log(testing_data$yearbuilt/max_yearbuilt)
testing_data$log_norm_sqft <- log(testing_data$sqft/max_sqft)
testing_data$log_norm_water_risk_3_blk <- log(testing_data$water_risk_3_blk/max_water_risk_3_blk)
testing_data$log_norm_water_risk_fre_3_blk <- log(testing_data$water_risk_fre_3_blk/max_water_risk_fre_3_blk)
testing_data$log_norm_water_risk_sev_3_blk <- log(testing_data$sqft/max_water_risk_sev_3_blk)
training_data$stories <- as.factor(training_data$stories)
training_data$units <- as.factor(training_data$units)
training_data$secondaryresidence <- as.factor(training_data$secondaryresidence)
training_data$sprinklersystem <- as.factor(training_data$sprinklersystem)
training_data$firealarmtype <- as.factor(training_data$firealarmtype)
training_data$waterdetectiondevice <- as.factor(training_data$waterdetectiondevice)
training_data$propertymanager <- as.factor(training_data$propertymanager)
training_data$cova_deductible <- as.factor(training_data$cova_deductible)
training_data$cova_limit <- as.factor(training_data$cova_limit )
training_data$waterded <- as.factor(training_data$waterded)
training_data$protectionclass <- as.factor(training_data$protectionclass)
training_data$fire_risk_model_score <- as.factor(training_data$fire_risk_model_score)
training_data$ordinanceorlawpct <- as.factor(training_data$ordinanceorlawpct)
training_data$functionalreplacementcost <- as.factor(training_data$functionalreplacementcost)
training_data$homegardcreditind <- as.factor(training_data$homegardcreditind)
training_data$landlordind <- as.factor(training_data$landlordind)
training_data$rentersinsurance <- as.factor(training_data$rentersinsurance)
training_data$burglaryalarmtype <- as.factor(training_data$burglaryalarmtype)
training_data$propertymanager <- as.factor(training_data$propertymanager)
testing_data$stories <- as.factor(testing_data$stories)
testing_data$units <- as.factor(testing_data$units)
testing_data$secondaryresidence <- as.factor(testing_data$secondaryresidence)
testing_data$sprinklersystem <- as.factor(testing_data$sprinklersystem)
testing_data$firealarmtype <- as.factor(testing_data$firealarmtype)
testing_data$waterdetectiondevice <- as.factor(testing_data$waterdetectiondevice)
testing_data$propertymanager <- as.factor(testing_data$propertymanager)
testing_data$cova_deductible <- as.factor(testing_data$cova_deductible)
testing_data$cova_limit <- as.factor(testing_data$cova_limit )
testing_data$waterded <- as.factor(testing_data$waterded)
testing_data$protectionclass <- as.factor(testing_data$protectionclass)
testing_data$fire_risk_model_score <- as.factor(testing_data$fire_risk_model_score)
testing_data$ordinanceorlawpct <- as.factor(testing_data$ordinanceorlawpct)
testing_data$functionalreplacementcost <- as.factor(testing_data$functionalreplacementcost)
testing_data$homegardcreditind <- as.factor(testing_data$homegardcreditind)
testing_data$landlordind <- as.factor(testing_data$landlordind)
testing_data$rentersinsurance <- as.factor(testing_data$rentersinsurance)
testing_data$burglaryalarmtype <- as.factor(testing_data$burglaryalarmtype)
testing_data$propertymanager <- as.factor(testing_data$propertymanager)
#unique numclaims values
numclaims <- unique(training_data$cova_ic_nc_water)
formula_all <- cova_ic_nc_water ~
log_norm_yearbult +
log_norm_sqft +
stories +
roofcd +
units +
occupancycd +
waterded +
protectionclass +
constructioncd +
fire_risk_model_score +
usagetype +
secondaryresidence +
ordinanceorlawpct +
functionalreplacementcost +
homegardcreditind +
sprinklersystem +
landlordind +
rentersinsurance +
firealarmtype +
burglaryalarmtype +
waterdetectiondevice +
propertymanager +
#cova_deductible +
cova_limit +
log_norm_water_risk_3_blk +
log_norm_water_risk_fre_3_blk +
#log_norm_water_risk_sev_3_blk +
appl_fail_3_blk +
fixture_leak_3_blk +
pipe_froze_3_blk +
plumb_leak_3_blk +
rep_cost_3_blk +
ustructure_fail_3_blk +
waterh_fail_3_blk
ModelFile <- paste(ModelsDir,"zip_all.rds",sep="")
if(file.exists(ModelFile) && UseSavedIfExists){
zip_all <- readRDS(ModelFile)
} else {
zip_all <- zeroinfl(formula_all, data = training_data, offset=log_ecy,dist = "poisson",link= "logit")
saveRDS(zip_all, ModelFile)
}
summary(zip_all)
Normalization is needed to make the model work. I was not able to add into the model and test cova_deductible and log_norm_water_risk_sev_3_blk. It returns errors. usagetype returns NA in covariance matrix for some levels. Some predictors (like ordinanceorlawpct) are marked as significant but really it's a random component I feel.
formula_sign <- cova_ic_nc_water ~
log_norm_yearbult +
log_norm_sqft +
stories +
waterded +
constructioncd +
usagetype +
landlordind +
firealarmtype +
cova_limit +
log_norm_water_risk_3_blk +
log_norm_water_risk_fre_3_blk +
fixture_leak_3_blk +
pipe_froze_3_blk |
log_norm_yearbult +
log_norm_sqft +
roofcd +
occupancycd +
protectionclass +
constructioncd +
cova_limit +
log_norm_water_risk_3_blk +
log_norm_water_risk_fre_3_blk +
pipe_froze_3_blk
ModelFile <- paste(ModelsDir,"zip_sign.rds",sep="")
if(file.exists(ModelFile) && UseSavedIfExists){
zip_sign <- readRDS(ModelFile)
} else {
zip_sign <- zeroinfl(formula_sign, data = training_data, offset=log_ecy,dist = "poisson",link= "logit")
saveRDS(zip_sign, ModelFile)
}
summary(zip_sign)
Covariance matrix estimates are definetly better then in the complex model
lrtest(zip_sign,zip_all)
vuong(zip_sign,zip_all)
The results is a little bit contraversial. According to Raw, the complex model is better then the nested one. However Significant features model is better because it has smaller AIC and BIC and it's significant.
models <- list("All Features" = zip_all, "Sign Features"=zip_sign)
#Log likelyhood for all the models and DF
Models_LogLik <- rbind(logLik = sapply(models, function(x) round(logLik(x), digits = 0)),
Df = sapply(models, function(x) attr(logLik(x), "df")))
Models_LogLik
#AIC
Models_AIC <- rbind(AIC = sapply(models, function(x) round(AIC(x), digits = 0)))
Models_AIC
#BIC
Models_BIC <- rbind(BIC = sapply(models, function(x) round(BIC(x), digits = 0)))
Models_BIC
Smaller AIC (BIC) suggests a better model. The model with significant features has a smaller AIC and BIC then the model with only significant features.
sapply(models, function(x) coef(x)[1:20])
Sometimes the sign of the mean is different in the models
library(countreg)
par(mfrow = c(1, 2))
rootogram(zip_all,max = max(numclaims),main="All Features",style="hanging")
rootogram(zip_sign,max = max(numclaims),main="Significant Features",style="hanging")
All features model has deviance on numebr claims 2 and looks like more for only Significant features model
I use XGB to predict the final result based on probabilities of number claims in each row.
param0 <- list(
"objective" = "count:poisson"
, "eval_metric" = "poisson-nloglik"
)
#Adding predicted probability values
#to training and testing dataset for each numclaims
#as separate column
training_all <- predict(zip_all, training_data, type='prob')
testing_all <- predict(zip_all, testing_data, type='prob')
training_sign <- predict(zip_sign, training_data, type='prob')
testing_sign <- predict(zip_sign, testing_data, type='prob')
for(unique_value in numclaims){
training_data[paste("zip_all", unique_value, sep = ".")] <- training_all[,as.numeric(unique_value)+1]
testing_data[paste("zip_all", unique_value, sep = ".")] <- testing_all[,as.numeric(unique_value)+1]
training_data[paste("zip_sign", unique_value, sep = ".")] <- training_sign[,as.numeric(unique_value)+1]
testing_data[paste("zip_sign", unique_value, sep = ".")] <- testing_sign[,as.numeric(unique_value)+1]
}
#Features with probabilities listed by model
zip_all_features <- list();
zip_sign_features <- list();
i <- 1
for(unique_value in numclaims){
zip_all_features[[i]] <- paste("zip_all", unique_value, sep = ".")
zip_sign_features[[i]] <- paste("zip_sign", unique_value, sep = ".")
i <- i + 1
}
xgZIP_all_training = xgb.DMatrix(as.matrix(training_data[,unlist(zip_all_features)]), label = training_data$cova_ic_nc_water)
setinfo(xgZIP_all_training, "base_margin", log(training_data$ecy))
xgZIP_all_testing = xgb.DMatrix(as.matrix(testing_data[,unlist(zip_all_features)]), label = testing_data$cova_ic_nc_water)
setinfo(xgZIP_all_testing, "base_margin", log(testing_data$ecy))
xgb = xgb.train(
nrounds = 10000
, params = param0
, data = xgZIP_all_training
)
training_data$zip_all = predict(xgb, xgZIP_all_training)
testing_data$zip_all = predict(xgb, xgZIP_all_testing)
xgZIP_sign_training = xgb.DMatrix(as.matrix(training_data[,unlist(zip_sign_features)]), label = training_data$cova_ic_nc_water)
setinfo(xgZIP_sign_training, "base_margin", log(training_data$ecy))
xgZIP_sign_testing = xgb.DMatrix(as.matrix(testing_data[,unlist(zip_sign_features)]), label = testing_data$cova_ic_nc_water)
setinfo(xgZIP_sign_testing, "base_margin", log(testing_data$ecy))
xgb = xgb.train(
nrounds = 10000
, params = param0
, data = xgZIP_sign_training
)
training_data$zip_sign = predict(xgb, xgZIP_sign_training)
testing_data$zip_sign = predict(xgb, xgZIP_sign_testing)
As an accuracy measure I use Normalized Weighted Gini:
Testing_Gini <- data.frame ("All" = NormalizedWeightedGini(testing_data$cova_ic_nc_water,testing_data$zip_all,testing_data$ecy)
,"Significant Only" = NormalizedWeightedGini(testing_data$cova_ic_nc_water,testing_data$zip_sign,testing_data$ecy)
)
Testing_Gini
Overall, the models are very close to each other. There is a difference in Normalized Weighted Gini which does NOT looks significant.
TBD Does not work with Caret very strightforward and my manual approach is wrong somewhere