library(MASS)
library(lmtest)
library(pscl)
library(xgboost)
library(ggplot2)
library(caret)
source('/home/kate/code/Utils/MyFunctions.R')
# Directory where fitted models are cached as .rds files.
ModelsDir <- '/home/kate/Research/Property/Models/'
# When TRUE, reload a previously fitted model from disk instead of refitting.
UseSavedIfExists <- TRUE
# Pre-split training/testing sets for the water-claim-frequency (wcf) study.
training_data <- read.csv("/home/kate/Research/Property/Data/property_wcf_training.csv", header=TRUE)
testing_data <- read.csv("/home/kate/Research/Property/Data/property_wcf_testing.csv", header=TRUE)
NB works with factors, and because some numerical columns are actually ordered categories, I convert them to factors explicitly. I may need to do the same for some other columns, such as the Yes/No indicators.
# The *_3_blk columns are ordered risk-block codes stored as numbers; the
# count models below need them as categorical predictors, so convert them to
# factors in both data sets. (Loop replaces 14 copy-pasted statements.)
# NOTE(review): levels are derived per data set — if testing contains a code
# absent from training, prediction will fail; confirm the code sets match.
blk_factor_cols <- c(
  "appl_fail_3_blk", "fixture_leak_3_blk", "pipe_froze_3_blk",
  "plumb_leak_3_blk", "rep_cost_3_blk", "ustructure_fail_3_blk",
  "waterh_fail_3_blk"
)
for (col in blk_factor_cols) {
  training_data[[col]] <- as.factor(training_data[[col]])
  testing_data[[col]] <- as.factor(testing_data[[col]])
}
# Per-feature maxima are taken from the TRAINING set only, so training and
# testing are normalized on the same scale (no leakage from the test set).
max_yearbuilt <- max(training_data$yearbuilt)
max_sqft <- max(training_data$sqft)
max_water_risk_3_blk <- max(training_data$water_risk_3_blk)
max_water_risk_fre_3_blk <- max(training_data$water_risk_fre_3_blk)
max_water_risk_sev_3_blk <- max(training_data$water_risk_sev_3_blk)
# Log of the max-normalized value. Non-positive inputs yield -Inf/NaN, which
# hurdle()/optim cannot handle — TODO confirm all raw values are > 0.
# (The misspelled name log_norm_yearbult is kept: the model formulas use it.)
training_data$log_norm_yearbult <- log(training_data$yearbuilt/max_yearbuilt)
training_data$log_norm_sqft <- log(training_data$sqft/max_sqft)
training_data$log_norm_water_risk_3_blk <- log(training_data$water_risk_3_blk/max_water_risk_3_blk)
training_data$log_norm_water_risk_fre_3_blk <- log(training_data$water_risk_fre_3_blk/max_water_risk_fre_3_blk)
# BUG FIX: previously this divided sqft by max_water_risk_sev_3_blk (a
# copy-paste error); it must use the severity column itself. The mismatch
# is a likely cause of the "non-finite value supplied by optim" error noted
# below when this predictor was added to the model.
training_data$log_norm_water_risk_sev_3_blk <- log(training_data$water_risk_sev_3_blk/max_water_risk_sev_3_blk)
testing_data$log_norm_yearbult <- log(testing_data$yearbuilt/max_yearbuilt)
testing_data$log_norm_sqft <- log(testing_data$sqft/max_sqft)
testing_data$log_norm_water_risk_3_blk <- log(testing_data$water_risk_3_blk/max_water_risk_3_blk)
testing_data$log_norm_water_risk_fre_3_blk <- log(testing_data$water_risk_fre_3_blk/max_water_risk_fre_3_blk)
# BUG FIX: same sqft copy-paste error as on the training side.
testing_data$log_norm_water_risk_sev_3_blk <- log(testing_data$water_risk_sev_3_blk/max_water_risk_sev_3_blk)
# Remaining categorical predictors (policy/property attributes) converted to
# factors in both data sets. The loop replaces 38 copy-pasted statements and
# drops the accidental duplicate conversion of propertymanager (converting
# twice was a harmless no-op).
categorical_cols <- c(
  "stories", "units", "secondaryresidence", "sprinklersystem",
  "firealarmtype", "waterdetectiondevice", "propertymanager",
  "cova_deductible", "cova_limit", "waterded", "protectionclass",
  "fire_risk_model_score", "ordinanceorlawpct", "functionalreplacementcost",
  "homegardcreditind", "landlordind", "rentersinsurance", "burglaryalarmtype"
)
for (col in categorical_cols) {
  training_data[[col]] <- as.factor(training_data[[col]])
  testing_data[[col]] <- as.factor(testing_data[[col]])
}
# Unique observed claim counts; used later to index the per-count
# probability columns returned by predict(..., type = 'prob').
numclaims <- unique(training_data$cova_ic_nc_water)
# Full predictor set for the hurdle count model of cova_ic_nc_water.
# The commented-out terms broke the fit (see the notes after the fit below):
# fire_risk_model_score crashed the kernel, usagetype and cova_deductible
# produced NaN covariance entries, and log_norm_water_risk_sev_3_blk made
# optim fail with a non-finite value.
formula_all <- cova_ic_nc_water ~
log_norm_yearbult +
log_norm_sqft +
stories +
roofcd +
units +
occupancycd +
waterded +
protectionclass +
constructioncd +
#fire_risk_model_score +
#usagetype +
secondaryresidence +
ordinanceorlawpct +
functionalreplacementcost +
homegardcreditind +
sprinklersystem +
landlordind +
rentersinsurance +
firealarmtype +
burglaryalarmtype +
waterdetectiondevice +
propertymanager +
#cova_deductible +
cova_limit +
log_norm_water_risk_3_blk +
log_norm_water_risk_fre_3_blk +
#log_norm_water_risk_sev_3_blk +
appl_fail_3_blk +
fixture_leak_3_blk +
pipe_froze_3_blk +
plumb_leak_3_blk +
rep_cost_3_blk +
ustructure_fail_3_blk +
waterh_fail_3_blk
# All-features hurdle Poisson model (logit hurdle, Poisson count part) with
# a log-exposure offset; cached as an .rds file and reloaded on later runs
# unless UseSavedIfExists is disabled.
ModelFile <- paste0(ModelsDir, "hurdlepoisson_all.rds")
if (UseSavedIfExists && file.exists(ModelFile)) {
  hurdlepoisson_all <- readRDS(ModelFile)
} else {
  hurdlepoisson_all <- hurdle(
    formula_all,
    data = training_data,
    offset = log_ecy,
    dist = "poisson",
    zero.dist = "poisson",
    link = "logit"
  )
  saveRDS(hurdlepoisson_all, ModelFile)
}
summary(hurdlepoisson_all)
Notes from fitting: when fire_risk_model_score is added, the kernel dies — apparently too many predictors. usagetype gives all-NaN covariance entries; secondaryresidence is OK; cova_deductible also gives all NaNs. log_norm_water_risk_sev_3_blk raises "Error in optim(fn = countDist, gr = countGrad, par = c(start$count, if (dist == : non-finite value supplied by optim". Is the error tied to one specific predictor, or is it a cumulative effect of the whole predictor set?
Normalization is needed to make the model work. I was not able to add cova_deductible or log_norm_water_risk_sev_3_blk to the model and test them — both return errors. usagetype returns NA in the covariance matrix for some levels. Some predictors (like ordinanceorlawpct) are marked as significant, but I suspect that is really just random noise.
# Reduced predictor set: only the terms that were significant in the
# full model (hurdlepoisson_all) summary above.
formula_sign <- cova_ic_nc_water ~
log_norm_yearbult +
log_norm_sqft +
stories +
roofcd +
units +
waterded +
protectionclass +
constructioncd +
sprinklersystem +
landlordind +
firealarmtype +
cova_limit +
log_norm_water_risk_fre_3_blk +
appl_fail_3_blk +
fixture_leak_3_blk +
pipe_froze_3_blk +
plumb_leak_3_blk +
waterh_fail_3_blk
# Significant-features-only hurdle Poisson model, fitted and cached with
# the same scheme as the all-features model above.
ModelFile <- paste0(ModelsDir, "hurdlepoisson_sign.rds")
if (UseSavedIfExists && file.exists(ModelFile)) {
  hurdlepoisson_sign <- readRDS(ModelFile)
} else {
  hurdlepoisson_sign <- hurdle(
    formula_sign,
    data = training_data,
    offset = log_ecy,
    dist = "poisson",
    zero.dist = "poisson",
    link = "logit"
  )
  saveRDS(hurdlepoisson_sign, ModelFile)
}
summary(hurdlepoisson_sign)
Covariance matrix estimates are definitely better than in the complex model.
# Likelihood-ratio test of the nested (significant-only) model against the
# full model.
lrtest(hurdlepoisson_sign,hurdlepoisson_all)
# Vuong closeness test for the same pair of models.
vuong(hurdlepoisson_sign,hurdlepoisson_all)
The results are a little controversial. According to the raw statistic, the complex model is better than the nested one. However, the significant-features model is preferable because it has smaller AIC and BIC and the result is significant.
# Named list of both fitted models for side-by-side comparison below.
models <- list("All Features" = hurdlepoisson_all, "Sign Features"=hurdlepoisson_sign)
# Log likelihood (rounded) and degrees of freedom for each model.
Models_LogLik <- rbind(logLik = sapply(models, function(x) round(logLik(x), digits = 0)),
Df = sapply(models, function(x) attr(logLik(x), "df")))
Models_LogLik
# AIC per model (smaller is better).
Models_AIC <- rbind(AIC = sapply(models, function(x) round(AIC(x), digits = 0)))
Models_AIC
# BIC per model (smaller is better).
Models_BIC <- rbind(BIC = sapply(models, function(x) round(BIC(x), digits = 0)))
Models_BIC
A smaller AIC (BIC) suggests a better model. The model with significant features has a smaller AIC and BIC than the model with all features.
sapply(models, function(x) coef(x)[1:20])
The models agree on the sign of coefficients
# countreg (R-Forge) provides rootogram() for assessing count-model fit.
library(countreg)
# Two plots side by side, one per model.
par(mfrow = c(1, 2))
# Hanging rootograms of observed vs fitted claim-count frequencies, up to
# the largest observed count.
rootogram(hurdlepoisson_all,max = max(numclaims),main="All Features",style="hanging")
rootogram(hurdlepoisson_sign,max = max(numclaims),main="Significant Features",style="hanging")
The models look similar, apart from a deviation at the two-claims point.
I use XGB to predict the final result based on probabilities of number claims in each row.
# XGBoost setup: Poisson count objective scored by Poisson negative
# log-likelihood.
param0 <- list(
"objective" = "count:poisson"
, "eval_metric" = "poisson-nloglik"
)
# Add the predicted probability of each claim count as a separate column in
# the training and testing data sets, one column per value of numclaims.
# predict(..., type = 'prob') returns one column per count starting at 0,
# so column (count + 1) holds P(count) for each row.
training_all <- predict(hurdlepoisson_all, training_data, type='prob')
testing_all <- predict(hurdlepoisson_all, testing_data, type='prob')
training_sign <- predict(hurdlepoisson_sign, training_data, type='prob')
testing_sign <- predict(hurdlepoisson_sign, testing_data, type='prob')
for(unique_value in numclaims){
training_data[paste("hurdlepoisson_all", unique_value, sep = ".")] <- training_all[,as.numeric(unique_value)+1]
testing_data[paste("hurdlepoisson_all", unique_value, sep = ".")] <- testing_all[,as.numeric(unique_value)+1]
training_data[paste("hurdlepoisson_sign", unique_value, sep = ".")] <- training_sign[,as.numeric(unique_value)+1]
testing_data[paste("hurdlepoisson_sign", unique_value, sep = ".")] <- testing_sign[,as.numeric(unique_value)+1]
}
#Features with probabilities listed by model
HP_all_features <- list();
HP_sign_features <- list();
i <- 1
for(unique_value in numclaims){
HP_all_features[[i]] <- paste("hurdlepoisson_all", unique_value, sep = ".")
HP_sign_features[[i]] <- paste("hurdlepoisson_sign", unique_value, sep = ".")
i <- i + 1
}
# DMatrix of the all-features hurdle probabilities; label = observed claim
# count, with log(ecy) supplied as base margin, i.e. a log-exposure offset
# for the Poisson objective (ecy is presumably earned exposure — TODO
# confirm the units).
xgHP_all_training = xgb.DMatrix(as.matrix(training_data[,unlist(HP_all_features)]), label = training_data$cova_ic_nc_water)
setinfo(xgHP_all_training, "base_margin", log(training_data$ecy))
xgHP_all_testing = xgb.DMatrix(as.matrix(testing_data[,unlist(HP_all_features)]), label = testing_data$cova_ic_nc_water)
setinfo(xgHP_all_testing, "base_margin", log(testing_data$ecy))
# NOTE(review): 10000 rounds with no watchlist / early stopping may overfit
# the training data — consider early_stopping_rounds with an eval set.
xgb = xgb.train(
nrounds = 10000
, params = param0
, data = xgHP_all_training
)
# Stacked-model predictions (expected claim counts) for both data sets.
training_data$hurdlepoisson_all = predict(xgb, xgHP_all_training)
testing_data$hurdlepoisson_all = predict(xgb, xgHP_all_testing)
# Same stacking procedure for the significant-features hurdle model:
# DMatrix of its per-count probabilities with a log(ecy) base margin.
xgHP_sign_training = xgb.DMatrix(as.matrix(training_data[,unlist(HP_sign_features)]), label = training_data$cova_ic_nc_water)
setinfo(xgHP_sign_training, "base_margin", log(training_data$ecy))
xgHP_sign_testing = xgb.DMatrix(as.matrix(testing_data[,unlist(HP_sign_features)]), label = testing_data$cova_ic_nc_water)
setinfo(xgHP_sign_testing, "base_margin", log(testing_data$ecy))
# Reuses (overwrites) the xgb variable from the all-features run above;
# that model's predictions were already stored, so this is safe.
xgb = xgb.train(
nrounds = 10000
, params = param0
, data = xgHP_sign_training
)
# Stacked-model predictions for the significant-features pipeline.
training_data$hurdlepoisson_sign = predict(xgb, xgHP_sign_training)
testing_data$hurdlepoisson_sign = predict(xgb, xgHP_sign_testing)
As an accuracy measure I use Normalized Weighted Gini:
# Normalized Weighted Gini (from MyFunctions.R, sourced at the top) on the
# testing set for both stacked models, weighted by exposure (ecy).
Testing_Gini <- data.frame ("All" = NormalizedWeightedGini(testing_data$cova_ic_nc_water,testing_data$hurdlepoisson_all,testing_data$ecy)
,"Significant Only" = NormalizedWeightedGini(testing_data$cova_ic_nc_water,testing_data$hurdlepoisson_sign,testing_data$ecy)
)
Testing_Gini
Overall, the models are very close to each other. There is a difference in Normalized Weighted Gini which may be significant and indicates that the Significant-Features-Only model is better.
TBD: this does not work with caret very straightforwardly, and my manual approach is wrong somewhere.