Data

dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv", header=TRUE)

Claim Partial Dependency XGB Classification dataset exported from a Python notebook

pd_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA_PartialDependency.csv", header=TRUE)
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.6.5 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggplot2)
colnames(dataset)
##   [1] "modeldata_id"                       
##   [2] "systemidstart"                      
##   [3] "systemidend"                        
##   [4] "cal_year"                           
##   [5] "startdate"                          
##   [6] "enddate"                            
##   [7] "startdatetm"                        
##   [8] "enddatetm"                          
##   [9] "ecy"                                
##  [10] "log_ecy"                            
##  [11] "policynumber"                       
##  [12] "policy_uniqueid"                    
##  [13] "policyterm"                         
##  [14] "policytype"                         
##  [15] "effectivedate"                      
##  [16] "expirationdate"                     
##  [17] "policystate"                        
##  [18] "policyform"                         
##  [19] "persistency"                        
##  [20] "companycd"                          
##  [21] "carriercd"                          
##  [22] "agency_group"                       
##  [23] "producername"                       
##  [24] "territory"                          
##  [25] "risknumber"                         
##  [26] "risktype"                           
##  [27] "yearbuilt"                          
##  [28] "log_yearbuilt"                      
##  [29] "sqft"                               
##  [30] "log_sqft"                           
##  [31] "stories"                            
##  [32] "roofcd"                             
##  [33] "roofcd_encd"                        
##  [34] "units"                              
##  [35] "occupancycd"                        
##  [36] "occupancy_encd"                     
##  [37] "allperilded"                        
##  [38] "waterded"                           
##  [39] "protectionclass"                    
##  [40] "constructioncd"                     
##  [41] "constructioncd_encd"                
##  [42] "fire_risk_model_score"              
##  [43] "multipolicyind"                     
##  [44] "multipolicyindumbrella"             
##  [45] "earthquakeumbrellaind"              
##  [46] "usagetype"                          
##  [47] "usagetype_encd"                     
##  [48] "ordinanceorlawpct"                  
##  [49] "functionalreplacementcost"          
##  [50] "homegardcreditind"                  
##  [51] "sprinklersystem"                    
##  [52] "landlordind"                        
##  [53] "rentersinsurance"                   
##  [54] "firealarmtype"                      
##  [55] "burglaryalarmtype"                  
##  [56] "waterdetectiondevice"               
##  [57] "neighborhoodcrimewatchind"          
##  [58] "propertymanager"                    
##  [59] "safeguardplusind"                   
##  [60] "kitchenfireextinguisherind"         
##  [61] "gatedcommunityind"                  
##  [62] "deadboltind"                        
##  [63] "poolind"                            
##  [64] "replacementcostdwellingind"         
##  [65] "replacementvalueind"                
##  [66] "serviceline"                        
##  [67] "equipmentbreakdown"                 
##  [68] "numberoffamilies"                   
##  [69] "insuredage"                         
##  [70] "maritalstatus"                      
##  [71] "insurancescore"                     
##  [72] "overriddeninsurancescore"           
##  [73] "insurancescorevalue"                
##  [74] "insscoretiervalueband"              
##  [75] "financialstabilitytier"             
##  [76] "allcov_wp"                          
##  [77] "cova_wp"                            
##  [78] "cova_ep"                            
##  [79] "cova_deductible"                    
##  [80] "log_cova_deductible"                
##  [81] "cova_limit"                         
##  [82] "log_cova_limit"                     
##  [83] "cova_ic_nc_water"                   
##  [84] "hasclaim"                           
##  [85] "cova_il_nc_water"                   
##  [86] "log_cova_il_nc_water"               
##  [87] "water_risk_3_blk"                   
##  [88] "log_water_risk_3_blk"               
##  [89] "water_risk_fre_3_blk"               
##  [90] "log_water_risk_fre_3_blk"           
##  [91] "water_risk_sev_3_blk"               
##  [92] "log_water_risk_sev_3_blk"           
##  [93] "appl_fail_3_blk"                    
##  [94] "fixture_leak_3_blk"                 
##  [95] "pipe_froze_3_blk"                   
##  [96] "plumb_leak_3_blk"                   
##  [97] "rep_cost_3_blk"                     
##  [98] "ustructure_fail_3_blk"              
##  [99] "waterh_fail_3_blk"                  
## [100] "loaddate"                           
## [101] "customer_cnt_active_policies"       
## [102] "customer_cnt_active_policies_binned"
## [103] "cova_ic_nc_water_color"

Quick Overview

#dataset <- dataset[,all]
str(dataset)
## 'data.frame':    1990928 obs. of  103 variables:
##  $ modeldata_id                       : int  534254 1793227 880091 1653118 353309 1289956 305141 1115709 2075469 1924235 ...
##  $ systemidstart                      : int  1219556 7403270 2370713 6514641 733342 4348409 544662 3436984 8943949 8072132 ...
##  $ systemidend                        : int  1219556 7403270 2955483 6514641 733342 4348409 544662 3436984 8943949 8072132 ...
##  $ cal_year                           : int  2013 2018 2014 2018 2011 2016 2012 2015 2020 2020 ...
##  $ startdate                          : Factor w/ 4288 levels "2009-01-01","2009-01-02",..: 1462 3611 2067 3424 1030 2878 1096 2544 4092 4018 ...
##  $ enddate                            : Factor w/ 4619 levels "2009-01-09","2009-01-12",..: 1553 3620 2159 3431 1044 2890 1225 2524 4351 4156 ...
##  $ startdatetm                        : Factor w/ 23690 levels "2008-01-09 00:00:00",..: 6832 20119 11522 19085 5616 15737 4639 13892 22802 21286 ...
##  $ enddatetm                          : Factor w/ 16455 levels "2009-01-09 00:00:00",..: 5395 14950 8349 12630 3383 11613 4104 10087 16261 15864 ...
##  $ ecy                                : num  0.339 0.115 0.342 0.11 0.129 ...
##  $ log_ecy                            : num  -1.08 -2.16 -1.07 -2.21 -2.05 ...
##  $ policynumber                       : Factor w/ 244668 levels "AZF0082147","AZF0221975",..: 33157 192754 185786 139108 2424 15149 183739 163023 99696 200055 ...
##  $ policy_uniqueid                    : int  445056 1541369 786905 1427784 348649 1124000 292060 976554 1776548 1654555 ...
##  $ policyterm                         : int  4 7 4 10 1 3 1 7 4 6 ...
##  $ policytype                         : Factor w/ 2 levels "New","Renewal": 2 2 2 2 1 2 1 2 2 2 ...
##  $ effectivedate                      : Factor w/ 4329 levels "2008-01-09","2008-01-14",..: 1264 3654 2110 3467 1073 2921 935 2587 4135 3866 ...
##  $ expirationdate                     : Factor w/ 4450 levels "2009-01-09","2009-01-14",..: 1385 3775 2231 3588 1195 3042 1057 2709 4256 3987 ...
##  $ policystate                        : Factor w/ 3 levels "AZ","CA","NV": 2 2 2 2 1 1 2 2 2 2 ...
##  $ policyform                         : Factor w/ 9 levels "DF1","DF3","DF6",..: 2 9 9 9 2 9 9 9 2 9 ...
##  $ persistency                        : int  3 6 3 25 0 2 0 7 3 5 ...
##  $ companycd                          : int  1 17 17 1 1 16 17 17 17 1 ...
##  $ carriercd                          : Factor w/ 2 levels "CSEICO","CSESG": 1 2 2 1 1 1 2 2 2 1 ...
##  $ agency_group                       : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 446 582 89 50 265 265 378 426 426 426 ...
##  $ producername                       : Factor w/ 1269 levels "1ST CENTURY INS SVCS INC.",..: 982 1224 142 78 452 452 738 888 947 947 ...
##  $ territory                          : Factor w/ 10 levels "","AZ-A","AZ-T",..: 8 6 6 4 6 6 8 7 7 7 ...
##  $ risknumber                         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ risktype                           : Factor w/ 2 levels "Dwelling","Homeowners": 1 2 2 2 1 2 2 2 1 2 ...
##  $ yearbuilt                          : int  1986 1986 1994 1951 1974 1997 1990 1958 2005 1984 ...
##  $ log_yearbuilt                      : num  7.59 7.59 7.6 7.58 7.59 ...
##  $ sqft                               : int  1500 2200 2600 1100 1200 1400 1700 1100 2700 1300 ...
##  $ log_sqft                           : num  7.33 7.73 7.87 7.04 7.12 ...
##  $ stories                            : int  1 2 2 1 1 1 1 1 2 2 ...
##  $ roofcd                             : Factor w/ 7 levels "COMPO","MEMBRANE",..: 6 6 6 4 1 1 6 1 6 1 ...
##  $ roofcd_encd                        : int  7 7 7 6 8 8 7 8 7 8 ...
##  $ units                              : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ occupancycd                        : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ occupancy_encd                     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ allperilded                        : int  500 1000 2500 1000 1000 1000 1000 1000 2500 2500 ...
##  $ waterded                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ protectionclass                    : int  3 2 3 3 2 2 4 3 4 4 ...
##  $ constructioncd                     : Factor w/ 5 levels "AF","B","F","M",..: 3 1 1 3 2 5 3 3 1 1 ...
##  $ constructioncd_encd                : int  5 4 4 5 3 2 5 5 4 4 ...
##  $ fire_risk_model_score              : int  0 2 0 0 -1 -1 2 0 1 2 ...
##  $ multipolicyind                     : int  0 0 0 1 0 0 1 0 0 0 ...
##  $ multipolicyindumbrella             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ earthquakeumbrellaind              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ usagetype                          : Factor w/ 7 levels "COC","PRIMARY",..: 3 2 2 2 3 2 2 2 3 2 ...
##  $ usagetype_encd                     : int  6 7 7 7 6 7 7 7 6 7 ...
##  $ ordinanceorlawpct                  : int  10 20 0 10 0 0 0 0 10 10 ...
##  $ functionalreplacementcost          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ homegardcreditind                  : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ sprinklersystem                    : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ landlordind                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rentersinsurance                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ firealarmtype                      : int  0 0 1 1 1 1 1 0 1 1 ...
##  $ burglaryalarmtype                  : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ waterdetectiondevice               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ neighborhoodcrimewatchind          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ propertymanager                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ safeguardplusind                   : int  0 1 0 0 0 0 0 1 1 0 ...
##  $ kitchenfireextinguisherind         : int  1 1 1 0 0 0 0 1 0 0 ...
##  $ gatedcommunityind                  : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ deadboltind                        : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ poolind                            : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ replacementcostdwellingind         : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ replacementvalueind                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ serviceline                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ equipmentbreakdown                 : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ numberoffamilies                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ insuredage                         : int  66 61 39 92 79 36 72 59 55 69 ...
##  $ maritalstatus                      : Factor w/ 5 levels "~","Divorced",..: 1 3 3 1 4 4 4 1 3 3 ...
##  $ insurancescore                     : Factor w/ 3842 levels "(DOES","~","610",..: 2 2 2 2 2 3809 2 2 2 2 ...
##  $ overriddeninsurancescore           : Factor w/ 41 levels "~","01","02",..: 1 1 1 1 1 13 1 1 1 1 ...
##  $ insurancescorevalue                : Factor w/ 178 levels "~","605","626",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ insscoretiervalueband              : Factor w/ 24 levels "~","599-606",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ financialstabilitytier             : Factor w/ 42 levels "","~","01","02",..: 1 2 1 1 1 1 1 1 2 2 ...
##  $ allcov_wp                          : int  1029 1938 915 915 336 733 687 667 1532 712 ...
##  $ cova_wp                            : int  1029 1342 1732 1721 282 997 973 660 1233 1645 ...
##  $ cova_ep                            : num  349.2 154.2 592.7 188.4 36.3 ...
##  $ cova_deductible                    : int  500 1000 2500 1000 1000 1000 1000 1000 2500 2500 ...
##  $ log_cova_deductible                : num  6.21 6.91 7.82 6.91 6.91 ...
##  $ cova_limit                         : int  600000 600000 600000 400000 200000 200000 300000 300000 700000 400000 ...
##  $ log_cova_limit                     : num  13.2 13.3 13.2 12.7 11.7 ...
##  $ cova_ic_nc_water                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hasclaim                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cova_il_nc_water                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ log_cova_il_nc_water               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ water_risk_3_blk                   : int  230 329 213 94 234 183 329 128 171 237 ...
##  $ log_water_risk_3_blk               : num  5.44 5.8 5.36 4.54 5.46 ...
##  $ water_risk_fre_3_blk               : int  152 218 134 53 256 226 218 83 159 180 ...
##  $ log_water_risk_fre_3_blk           : num  5.02 5.38 4.9 3.97 5.55 ...
##  $ water_risk_sev_3_blk               : int  156 155 163 182 94 83 155 159 110 136 ...
##  $ log_water_risk_sev_3_blk           : num  5.05 5.04 5.09 5.2 4.54 ...
##  $ appl_fail_3_blk                    : int  5 4 1 5 5 5 5 2 5 5 ...
##  $ fixture_leak_3_blk                 : int  4 2 1 0 3 5 2 2 0 1 ...
##  $ pipe_froze_3_blk                   : int  0 0 0 2 0 0 2 2 2 2 ...
##  $ plumb_leak_3_blk                   : int  5 5 5 1 5 4 4 4 1 4 ...
##  $ rep_cost_3_blk                     : int  5 5 5 5 5 4 5 5 5 5 ...
##  $ ustructure_fail_3_blk              : int  5 5 5 3 5 5 5 5 1 5 ...
##  $ waterh_fail_3_blk                  : int  2 2 2 0 0 1 2 0 2 1 ...
##   [list output truncated]
summary(dataset)
##   modeldata_id     systemidstart      systemidend         cal_year   
##  Min.   :      1   Min.   :      2   Min.   :      3   Min.   :2009  
##  1st Qu.: 544975   1st Qu.:1251218   1st Qu.:1348961   1st Qu.:2013  
##  Median :1092584   Median :3273967   Median :3422898   Median :2016  
##  Mean   :1089045   Mean   :3880098   Mean   :3980904   Mean   :2015  
##  3rd Qu.:1634112   3rd Qu.:6388617   3rd Qu.:6562387   3rd Qu.:2018  
##  Max.   :2178777   Max.   :9657386   Max.   :9657502   Max.   :2021  
##                                                                      
##       startdate             enddate                     startdatetm     
##  2019-01-01: 113585   2019-01-01: 113556   2018-06-01 00:00:00:   1707  
##  2018-01-01: 108834   2018-01-01: 108763   2017-09-01 00:00:00:   1557  
##  2020-01-01:  96348   2020-01-01:  96432   2018-07-01 00:00:00:   1547  
##  2017-01-01:  88488   2017-01-01:  88375   2017-07-01 00:00:00:   1508  
##  2016-01-01:  76457   2016-01-01:  76443   2018-08-01 00:00:00:   1454  
##  2013-01-01:  74667   2013-01-01:  74676   2018-09-01 00:00:00:   1435  
##  (Other)   :1432549   (Other)   :1432683   (Other)            :1981720  
##                enddatetm            ecy            log_ecy         
##  2019-06-01 00:00:00:   2013   Min.   :0.0027   Min.   :-5.914504  
##  2018-06-01 00:00:00:   1893   1st Qu.:0.2464   1st Qu.:-1.400799  
##  2019-07-01 00:00:00:   1887   Median :0.4709   Median :-0.753110  
##  2019-08-01 00:00:00:   1842   Mean   :0.4794   Mean   :-1.020831  
##  2018-07-01 00:00:00:   1829   3rd Qu.:0.7091   3rd Qu.:-0.343759  
##  2018-09-01 00:00:00:   1802   Max.   :1.0020   Max.   : 0.001998  
##  (Other)            :1979662                                       
##      policynumber     policy_uniqueid     policyterm       policytype     
##  CAF0389924:    201   Min.   :      1   Min.   : 1.000   New    : 474221  
##  CAF0461789:    192   1st Qu.: 493706   1st Qu.: 2.000   Renewal:1516707  
##  CAF0475516:    168   Median : 957578   Median : 3.000                    
##  CAF0464778:    144   Mean   : 944610   Mean   : 3.799                    
##  CAF0393082:    130   3rd Qu.:1407795   3rd Qu.: 5.000                    
##  CAF0468201:    120   Max.   :1854593   Max.   :13.000                    
##  (Other)   :1989973                                                       
##     effectivedate        expirationdate    policystate 
##  2018-06-01:   2058   2019-06-01:   2058   AZ: 148332  
##  2017-07-01:   1945   2018-07-01:   1943   CA:1775912  
##  2017-09-01:   1923   2018-09-01:   1923   NV:  66684  
##  2018-07-01:   1914   2019-07-01:   1913               
##  2017-06-01:   1901   2018-06-01:   1904               
##  2018-08-01:   1898   2019-08-01:   1892               
##  (Other)   :1979289   (Other)   :1979295               
##        policyform       persistency        companycd      carriercd      
##  HO3        :1011501   Min.   :  0.000   Min.   : 1.00   CSEICO: 687722  
##  DF3        : 813757   1st Qu.:  1.000   1st Qu.: 1.00   CSESG :1303206  
##  DF6        :  84023   Median :  3.000   Median :17.00                   
##  Form3      :  39639   Mean   :  6.081   Mean   :12.08                   
##  FL1-Vacant :  22661   3rd Qu.:  8.000   3rd Qu.:17.00                   
##  FL3-Special:  10314   Max.   :103.000   Max.   :19.00                   
##  (Other)    :   9033                                                     
##                                        agency_group    
##  WESTERN GOLD INS AGCY INC.                  : 235856  
##  J.E. BROWN and ASSOCS INS SVCS              :  87275  
##  PIIB - PACIFIC INTERSTATE INS               :  63703  
##  CRUSBERG DECKER INS SVCS INC                :  61736  
##  ISU INSURANCE SERVICES OF  SAN FRANCISCO INC:  53588  
##  Acrisure of California                      :  47783  
##  (Other)                                     :1440987  
##                          producername       territory     
##  WESTERN GOLD INS AGCY INC.    : 234240   CA-B   :602884  
##  J.E. BROWN and ASSOCS INS SVCS:  82834   CA-C   :410013  
##  CRUSBERG DECKER INS SVCS INC  :  40881   CA-O   :349951  
##  TOMM and BUCK INSURANCE SRVCS :  23599   CA-A   :301284  
##  BICHLMEIER INSURANCE SRVS INC :  23396   AZ-A   :131032  
##  VALLEY WEST FINANCIAL INS SVC :  22406   CA-T   :125223  
##  (Other)                       :1563572   (Other): 70541  
##    risknumber           risktype         yearbuilt    log_yearbuilt  
##  Min.   : 0.000   Dwelling  : 939788   Min.   :1900   Min.   :7.523  
##  1st Qu.: 1.000   Homeowners:1051140   1st Qu.:1959   1st Qu.:7.580  
##  Median : 1.000                        Median :1979   Median :7.590  
##  Mean   : 1.006                        Mean   :1976   Mean   :7.589  
##  3rd Qu.: 1.000                        3rd Qu.:1996   3rd Qu.:7.599  
##  Max.   :16.000                        Max.   :2019   Max.   :7.611  
##                                                                      
##       sqft         log_sqft        stories           roofcd      
##  Min.   : 800   Min.   :6.397   Min.   :1.000   COMPO   :997908  
##  1st Qu.:1300   1st Qu.:7.177   1st Qu.:1.000   MEMBRANE: 23403  
##  Median :1700   Median :7.438   Median :1.000   METAL   :  5135  
##  Mean   :1873   Mean   :7.470   Mean   :1.208   OTHER   :234545  
##  3rd Qu.:2300   3rd Qu.:7.749   3rd Qu.:1.000   TAR     : 45071  
##  Max.   :5000   Max.   :9.210   Max.   :3.000   TILE    :666814  
##                                                 WOOD    : 18052  
##   roofcd_encd       units            occupancycd      occupancy_encd 
##  Min.   :1.00   Min.   :1.000   NO         :      5   Min.   :1.000  
##  1st Qu.:7.00   1st Qu.:1.000   OCCUPIEDNOW:1856970   1st Qu.:1.000  
##  Median :8.00   Median :1.000   TENANT     : 133953   Median :1.000  
##  Mean   :7.23   Mean   :1.125                         Mean   :1.067  
##  3rd Qu.:8.00   3rd Qu.:1.000                         3rd Qu.:1.000  
##  Max.   :8.00   Max.   :4.000                         Max.   :3.000  
##                                                                      
##   allperilded       waterded        protectionclass  constructioncd 
##  Min.   :    0   Min.   :    0.00   Min.   : 0.000   AF   : 801773  
##  1st Qu.: 1000   1st Qu.:    0.00   1st Qu.: 2.000   B    :  22461  
##  Median : 1000   Median :    0.00   Median : 3.000   F    :1121154  
##  Mean   : 1403   Mean   :   94.63   Mean   : 2.947   M    :  22588  
##  3rd Qu.: 2500   3rd Qu.:    0.00   3rd Qu.: 4.000   OTHER:  22952  
##  Max.   :10000   Max.   :10000.00   Max.   :10.000                  
##                                                                     
##  constructioncd_encd fire_risk_model_score multipolicyind  
##  Min.   :1.000       Min.   :-1.0000       Min.   :0.0000  
##  1st Qu.:4.000       1st Qu.: 0.0000       1st Qu.:0.0000  
##  Median :5.000       Median : 0.0000       Median :0.0000  
##  Mean   :4.495       Mean   : 0.2137       Mean   :0.1617  
##  3rd Qu.:5.000       3rd Qu.: 0.0000       3rd Qu.:0.0000  
##  Max.   :5.000       Max.   :18.0000       Max.   :1.0000  
##                                                            
##  multipolicyindumbrella earthquakeumbrellaind      usagetype      
##  Min.   :0.000000       Min.   :0.000000      COC       :  11503  
##  1st Qu.:0.000000       1st Qu.:0.000000      PRIMARY   :1107959  
##  Median :0.000000       Median :0.000000      RENTAL    : 848301  
##  Mean   :0.005191       Mean   :0.004436      SEASONAL  :   8397  
##  3rd Qu.:0.000000       3rd Qu.:0.000000      SECONDARY :   3601  
##  Max.   :1.000000       Max.   :1.000000      UNOCCUPIED:    321  
##                                               VACANT    :  10846  
##  usagetype_encd  ordinanceorlawpct functionalreplacementcost
##  Min.   :1.000   Min.   :  0.000   Min.   :0.000000         
##  1st Qu.:6.000   1st Qu.:  0.000   1st Qu.:0.000000         
##  Median :7.000   Median : 10.000   Median :0.000000         
##  Mean   :6.519   Mean   :  9.887   Mean   :0.001915         
##  3rd Qu.:7.000   3rd Qu.: 10.000   3rd Qu.:0.000000         
##  Max.   :7.000   Max.   :100.000   Max.   :1.000000         
##                                                             
##  homegardcreditind sprinklersystem    landlordind      rentersinsurance  
##  Min.   :0.0000    Min.   :0.00000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.0000    1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.0000    Median :0.00000   Median :0.00000   Median :0.000000  
##  Mean   :0.1413    Mean   :0.03199   Mean   :0.08028   Mean   :0.005884  
##  3rd Qu.:0.0000    3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :1.0000    Max.   :1.00000   Max.   :1.00000   Max.   :1.000000  
##                                                                          
##  firealarmtype   burglaryalarmtype waterdetectiondevice
##  Min.   :0.000   Min.   :0.0000    Min.   :0.0000000   
##  1st Qu.:0.000   1st Qu.:0.0000    1st Qu.:0.0000000   
##  Median :1.000   Median :0.0000    Median :0.0000000   
##  Mean   :0.627   Mean   :0.3655    Mean   :0.0002401   
##  3rd Qu.:1.000   3rd Qu.:1.0000    3rd Qu.:0.0000000   
##  Max.   :1.000   Max.   :1.0000    Max.   :1.0000000   
##                                                        
##  neighborhoodcrimewatchind propertymanager   safeguardplusind
##  Min.   :0.00000           Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000           1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000           Median :0.00000   Median :0.0000  
##  Mean   :0.01387           Mean   :0.02075   Mean   :0.3691  
##  3rd Qu.:0.00000           3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000           Max.   :1.00000   Max.   :1.0000  
##                                                              
##  kitchenfireextinguisherind gatedcommunityind  deadboltind    
##  Min.   :0.0000             Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000             1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000             Median :0.00000   Median :1.0000  
##  Mean   :0.3979             Mean   :0.01435   Mean   :0.7198  
##  3rd Qu.:1.0000             3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.0000             Max.   :1.00000   Max.   :1.0000  
##                                                               
##     poolind        replacementcostdwellingind replacementvalueind
##  Min.   :0.00000   Min.   :0.0000             Min.   :0.0000     
##  1st Qu.:0.00000   1st Qu.:0.0000             1st Qu.:0.0000     
##  Median :0.00000   Median :0.0000             Median :0.0000     
##  Mean   :0.03559   Mean   :0.3573             Mean   :0.0168     
##  3rd Qu.:0.00000   3rd Qu.:1.0000             3rd Qu.:0.0000     
##  Max.   :1.00000   Max.   :1.0000             Max.   :1.0000     
##                                                                  
##   serviceline     equipmentbreakdown numberoffamilies   insuredage    
##  Min.   :0.0000   Min.   :0.0000     Min.   :0.000    Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:0.0000     1st Qu.:1.000    1st Qu.: 46.00  
##  Median :0.0000   Median :0.0000     Median :1.000    Median : 57.00  
##  Mean   :0.1171   Mean   :0.1198     Mean   :1.126    Mean   : 56.41  
##  3rd Qu.:0.0000   3rd Qu.:0.0000     3rd Qu.:1.000    3rd Qu.: 66.00  
##  Max.   :1.0000   Max.   :1.0000     Max.   :4.000    Max.   :177.00  
##                                                       NA's   :236483  
##   maritalstatus    insurancescore    overriddeninsurancescore
##  ~       :794672   ~      :1902377   ~      :1846216         
##  Divorced:  8151   99     :    418   99     :  49428         
##  Married :565124   KQXKD  :    155   07     :   9664         
##  Single  :603723   ZKKWS  :    119   12     :   8023         
##  Widowed : 19258   KFWZV  :    108   04     :   6305         
##                    XXQKS  :    101   06     :   5842         
##                    (Other):  87650   (Other):  65450         
##  insurancescorevalue insscoretiervalueband financialstabilitytier
##  ~      :1990011     ~      :1990011              :1740132       
##  825    :     36     837-865:    107       ~      : 234605       
##  873    :     20     865-880:     94       07     :   1577       
##  769    :     18     748-774:     90       12     :   1337       
##  881    :     15     820-837:     81       04     :   1030       
##  829    :     14     894-945:     73       13     :    970       
##  (Other):    814     (Other):    472       (Other):  11277       
##    allcov_wp        cova_wp           cova_ep        cova_deductible
##  Min.   :   32   Min.   :    5.0   Min.   :-8666.2   Min.   :    0  
##  1st Qu.:  548   1st Qu.:  468.0   1st Qu.:  116.4   1st Qu.: 1000  
##  Median :  777   Median :  755.0   Median :  283.4   Median : 1000  
##  Mean   :  911   Mean   :  973.5   Mean   :  411.2   Mean   : 1403  
##  3rd Qu.: 1105   3rd Qu.: 1231.0   3rd Qu.:  567.6   3rd Qu.: 2500  
##  Max.   :18926   Max.   :14620.0   Max.   :12121.9   Max.   :10000  
##                                                                     
##  log_cova_deductible   cova_limit      log_cova_limit   cova_ic_nc_water  
##  Min.   :0.000       Min.   : 100000   Min.   : 8.294   Min.   :0.000000  
##  1st Qu.:6.908       1st Qu.: 300000   1st Qu.:12.384   1st Qu.:0.000000  
##  Median :6.908       Median : 400000   Median :12.702   Median :0.000000  
##  Mean   :7.018       Mean   : 422080   Mean   :12.689   Mean   :0.005978  
##  3rd Qu.:7.824       3rd Qu.: 500000   3rd Qu.:13.035   3rd Qu.:0.000000  
##  Max.   :9.210       Max.   :1300000   Max.   :14.944   Max.   :3.000000  
##                                                                           
##     hasclaim        cova_il_nc_water   log_cova_il_nc_water
##  Min.   :0.000000   Min.   : -5536.9   Min.   :-0.10536    
##  1st Qu.:0.000000   1st Qu.:     0.0   1st Qu.: 0.00000    
##  Median :0.000000   Median :     0.0   Median : 0.00000    
##  Mean   :0.005844   Mean   :    79.6   Mean   : 0.05128    
##  3rd Qu.:0.000000   3rd Qu.:     0.0   3rd Qu.: 0.00000    
##  Max.   :1.000000   Max.   :522735.2   Max.   :13.16683    
##                                        NA's   :3           
##  water_risk_3_blk log_water_risk_3_blk water_risk_fre_3_blk
##  Min.   :  15.0   Min.   :2.708        Min.   :  15.0      
##  1st Qu.: 134.0   1st Qu.:4.898        1st Qu.: 108.0      
##  Median : 185.0   Median :5.220        Median : 154.0      
##  Mean   : 202.6   Mean   :5.202        Mean   : 169.4      
##  3rd Qu.: 244.0   3rd Qu.:5.497        3rd Qu.: 209.0      
##  Max.   :1491.0   Max.   :7.307        Max.   :2308.0      
##                                                            
##  log_water_risk_fre_3_blk water_risk_sev_3_blk log_water_risk_sev_3_blk
##  Min.   :2.708            Min.   : 33.0        Min.   :3.497           
##  1st Qu.:4.682            1st Qu.:106.0        1st Qu.:4.663           
##  Median :5.037            Median :125.0        Median :4.828           
##  Mean   :5.014            Mean   :127.7        Mean   :4.821           
##  3rd Qu.:5.342            3rd Qu.:147.0        3rd Qu.:4.990           
##  Max.   :7.744            Max.   :313.0        Max.   :5.746           
##                                                                        
##  appl_fail_3_blk fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk
##  Min.   :0.000   Min.   :0.000      Min.   :0.000    Min.   :0.00    
##  1st Qu.:4.000   1st Qu.:1.000      1st Qu.:2.000    1st Qu.:1.00    
##  Median :5.000   Median :2.000      Median :2.000    Median :4.00    
##  Mean   :4.118   Mean   :1.884      Mean   :1.735    Mean   :3.07    
##  3rd Qu.:5.000   3rd Qu.:3.000      3rd Qu.:2.000    3rd Qu.:4.00    
##  Max.   :5.000   Max.   :5.000      Max.   :5.000    Max.   :5.00    
##                                                                      
##  rep_cost_3_blk  ustructure_fail_3_blk waterh_fail_3_blk
##  Min.   :0.000   Min.   :0.000         Min.   :0.000    
##  1st Qu.:5.000   1st Qu.:5.000         1st Qu.:0.000    
##  Median :5.000   Median :5.000         Median :1.000    
##  Mean   :4.797   Mean   :4.441         Mean   :1.154    
##  3rd Qu.:5.000   3rd Qu.:5.000         3rd Qu.:2.000    
##  Max.   :5.000   Max.   :5.000         Max.   :5.000    
##                                                         
##                     loaddate       customer_cnt_active_policies
##  2020-08-07 08:09:37.497:1990928   Min.   :  1.000             
##                                    1st Qu.:  1.000             
##                                    Median :  1.000             
##                                    Mean   :  1.757             
##                                    3rd Qu.:  1.000             
##                                    Max.   :147.000             
##                                                                
##  customer_cnt_active_policies_binned cova_ic_nc_water_color
##  Min.   :  1.000                     Min.   :0.000000      
##  1st Qu.:  1.000                     1st Qu.:0.000000      
##  Median :  1.000                     Median :0.000000      
##  Mean   :  2.936                     Mean   :0.005978      
##  3rd Qu.:  1.000                     3rd Qu.:0.000000      
##  Max.   :150.000                     Max.   :3.000000      
## 

Visualization

Number of claims

dataset$cova_ic_nc_water_color <- as.factor(dataset$cova_ic_nc_water)
ggplot(dataset, aes(x = cova_ic_nc_water, fill=cova_ic_nc_water_color)) +
     geom_bar() +
     labs(x = 'Number of Claims', y = 'Count', title = 'Histogram of Number of Claims') +
     scale_fill_manual("legend", values = c("0" = "#56B4E9", "1" = "red", "2" = "red", "3" = "red")) + 
     geom_text(stat='count', aes(label=..count..), vjust=1)

Since the cases when there are more then 1 claim per exposure are very rare and, I create a new logical attribute HasClaim with values 1 or 0. It can be used in logistic regression but I use it visualize if there is any visual dependency between predictors and claims.

dataset$hasclaim <- as.factor(dataset$hasclaim)
ggplot(dataset, aes(x = hasclaim, fill=hasclaim)) +
     geom_bar() +
     scale_fill_manual("legend", values = c("0" = "#56B4E9", "1" = "red")) + 
     labs(x = 'Adjusted Number of Claims', y = 'Count', title = 'Adjusted Histogram of Number of Claims') +
geom_text(stat='count', aes(label=..count..), vjust=1)

There is cery low percent of water related claims: 0.6%

Predictors

(the order is from XGB Classification feature importance)

ratio <- nrow(dataset)
#grid for multiplots
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  require(grid)

  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

  if (numPlots == 1) {
    print(plots[[1]])

  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
#continuous attributes
plot_continuous <- function (col_name) {
  p1 <- ggplot(dataset, aes(x = .data[[col_name]], fill=hasclaim)) +
  geom_histogram(bins=100) +
  scale_fill_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) +   
    
  labs(x = col_name, y = 'Count', title = paste("Histogram of", col_name))

 #p2 <- ggplot(dataset, aes(x = .data[[col_name]])) +
  #geom_density() + 
 # labs(x = col_name, y = 'Density', title = paste("Density of", col_name))
  
  pd_col <- pd_dataset[pd_dataset$feature == col_name,][c('value','pd')]
  
  p2 <- ggplot(pd_col, aes(x = value, y = pd)) +
  geom_line(aes(color="darkred")) + 
 labs(x = col_name, y = 'pd', title = paste("Claim Partial Dependency of", col_name))

  p3 <- ggplot(dataset, aes(x=hasclaim, y=.data[[col_name]], col=hasclaim, fill=hasclaim)) +
  geom_boxplot(notch = TRUE) +
  scale_fill_manual(breaks = c("0","1"), 
                      values=c("#56B4E9","red")) +   
  theme(legend.position = "none") +
  labs(y = col_name, title = paste("Box Plot of", col_name, "with hasclaim"))

  vec <- dataset[[col_name]]
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]

  p4 <- ggplot(dataset, aes(sample = .data[[col_name]], col='red')) + 
  stat_qq() + 
  geom_abline(slope = slope, intercept = int) +
  theme(legend.position = "none") +
  labs(y = col_name, title = paste("QQ Plot of", col_name))

  
  multiplot(p1,p2,p3,p4, cols=2)
}
#categorical attributes - multiplot does not work for fun modeling
plot_categorical <- function(col_name) {
  
  if ( col_name == 'roofcd_encd') {
col_name_original <- 'roofcd'
} 
  else if ( col_name == 'usagetype_encd' ) {
col_name_original <- 'usagetype'
} 
else if ( col_name == 'constructioncd_encd' ) {
col_name_original <- 'constructioncd'
} 
  else if ( col_name == 'occupancy_encd' ) {
col_name_original <- 'occupancycd'
} 
  else {
  col_name_original <- col_name
}
  
  
pd_col <- pd_dataset[pd_dataset$feature == col_name,][c('value','pd')]
colnames(pd_col) <- c(col_name,'pd')

df <- merge(dataset[c(col_name,'hasclaim')],pd_col,by=col_name)  



if (grepl('encd',col_name) | 
    col_name =='fire_risk_model_score' | 
    col_name =='customer_cnt_active_policies_binned' |
    col_name =='cova_deductible' | 
    col_name =='cova_limit' | 
    col_name =='protectionclass' | 
    col_name =='ordinanceorlawpct' | 
    col_name =='numberoffamilies' | 
    col_name =='waterded' | 
    col_name =='units' | 
    col_name =='stories') {
  XBreaks <- as.vector(unlist(unique(dataset[c(col_name)])))
  XLabels <- as.vector(unlist(unique(dataset[c(col_name_original)]))) 
} 
else if (col_name =='pipe_froze_3_blk' | 
col_name =='water_risk_3_blk' | 
col_name =='ustructure_fail_3_blk' | 
col_name =='water_risk_fre_3_blk' | 
col_name =='waterh_fail_3_blk' | 
col_name =='rep_cost_3_blk' | 
col_name =='plumb_leak_3_blk' | 
col_name =='appl_fail_3_blk' | 
col_name =='fixture_leak_3_blk') {
  XBreaks <- c(0,1,2,3,4,5)
  XLabels <- c('Low', 'Elevated', 'Below Avg', 'Average', 'High', 'Highest')    
}
else {
  XBreaks <- c(0,1)
  XLabels <- c("No", "Yes")  

}

cols <- c('PD'='#f04546')
p1 = ggplot(df) +
     geom_bar(aes(x = .data[[col_name]], fill=hasclaim)) +
     scale_fill_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) +   
     geom_line(aes(x = .data[[col_name]], y=(pd)*ratio*100,color='PD')) +
     scale_y_continuous(sec.axis = sec_axis(~./ratio, name = "PD")) +
     scale_x_continuous( breaks=XBreaks, labels= XLabels) +
     labs(x = col_name, y = 'Count', title = paste("Histogram and claims partial dependency of", col_name_original)) +
     theme(axis.text.x = element_text(angle = 90, hjust = 1))


if (length(unique(dataset[[col_name_original]]))  <= 6) 
{p2 = bayesian_plot(data=dataset, input=col_name_original, target='hasclaim')}

p3 = cross_plot(data=dataset, str_input=col_name_original, str_target='hasclaim',plot_type='percentual')


if (length(unique(dataset[[col_name_original]]))  <= 6) {
  print(p1)
  print(p2)
  print(p3)
  }
else {  
  print(p1)
  print(p3)
  }
  
}
plot_categorical('usagetype_encd')  

## NULL

The more property is used, the higher claims rate.

plot_categorical('customer_cnt_active_policies_binned')  

## NULL

If the same customer has more then 10 active policies, the claim rate is lower.

plot_continuous('ecy')  
## Loading required package: grid

The longer the exposure, the higher claim rate.

plot_categorical('cova_deductible')  

## NULL

The claim rate is higher in low deductible policies.

plot_continuous('yearbuilt')  

More claims are in newer houses but not in modern.

plot_categorical('landlordind')  

## NULL

This is a discount based on the number of policies for the same customer. It’s correlated with customer_cnt_active_policies_binned and has the same claim dependency but without details: more policies less claims rate.

plot_categorical('pipe_froze_3_blk')  

## NULL

Everything indicates we have more claims in the lowest category. It may makes sense for california, where we have most insured properties in an area where low tempretures are rare but not useful for further analysis.

plot_categorical('roofcd_encd')  

## NULL

Visible higher claim rate in WOOD, TILE, and maybe, TAR and OTHER

plot_continuous('sqft')  

The higher sqft, the higher claim rate till some limit, where it is not increased.

plot_categorical('firealarmtype')  

## NULL

Firealarmtype is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. Adding the predictor to a model increase overfitting.

plot_categorical('stories')  

## NULL

There is a clear evidence, the 2-stories properties have more claims. However, the attributes is broken in the database. There are a lot of empty or high values (known issues).

plot_categorical('equipmentbreakdown')  

## NULL

Equipmentbreakdown is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. Adding the predictor to a model increase overfitting.

plot_categorical('cova_limit')  

## NULL

More claims from more expensive properties.

plot_categorical('replacementvalueind')  

## NULL

Replacementvalueind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes.

plot_categorical('propertymanager')  

## NULL

Even if PropertyManager decrease number of claims according to visual analysis it is not clear from the partial dependency. Probably due to correlations.

plot_categorical('multipolicyind')  

## NULL

On the one hand, there are more claims in “Yes” multipolicyind category, on the other, it’s different in the partial dependency. The predictor is not very significant in GLM

plot_categorical('poolind')  

## NULL

More claims in properties with pools according to the charts and baysian comparizon but it’s different in the partial dependency. GLM results are more close to the visual and significant. The difference between “No” and “Yes” categories is very small and can be due to errors.

plot_categorical('replacementcostdwellingind')  

## NULL

Replacementcostdwellingind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. There is almost no difference between categories in the partial dependency.

plot_categorical('safeguardplusind')  

## NULL

Safeguardplusind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. There is almost no difference between categories in the partial dependency and charts.

plot_categorical('ustructure_fail_3_blk')  

## NULL

There are slight increase in the “Below Average” category partial dependency but not in visual.

plot_continuous('water_risk_3_blk')  

The higher the score, the more claims according to box-plots and partial dependency.

The rest of the predictors are not very important or directly related to water claims.

plot_categorical('serviceline')  

## NULL
plot_categorical('protectionclass')  

## NULL
plot_continuous('water_risk_fre_3_blk')  

plot_categorical('rep_cost_3_blk')  

## NULL
plot_categorical('waterh_fail_3_blk')  

## NULL
plot_categorical('deadboltind')  

## NULL
plot_categorical('homegardcreditind')  

## NULL
plot_categorical('ordinanceorlawpct')  

## NULL
plot_categorical('occupancy_encd')  

## NULL
plot_categorical('burglaryalarmtype')  

## NULL
plot_categorical('waterded')  

## NULL
plot_categorical('plumb_leak_3_blk')  

## NULL
plot_categorical('appl_fail_3_blk')  

## NULL
plot_categorical('numberoffamilies')  

## NULL
plot_categorical('units')  

## NULL
plot_continuous('water_risk_sev_3_blk')  

plot_categorical('multipolicyindumbrella')  

## NULL
plot_categorical('kitchenfireextinguisherind')  

## NULL
plot_categorical('fixture_leak_3_blk')  

## NULL
plot_categorical('constructioncd_encd')  

## NULL
plot_categorical('fire_risk_model_score')  

## NULL
plot_categorical('gatedcommunityind')  

## NULL
plot_categorical('sprinklersystem')  

## NULL
plot_categorical('neighborhoodcrimewatchind')  

## NULL
plot_categorical('rentersinsurance')  

## NULL
plot_categorical('earthquakeumbrellaind')  

## NULL
plot_categorical('functionalreplacementcost')  

## NULL

Misc visualization

ggplot(data=dataset, mapping = aes(x=yearbuilt, y=sqft, color=hasclaim)) +
  geom_point() +
  scale_color_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) + 
  scale_fill_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) +
  labs(title = "Sqft vs YearBuilt and HasClaim")

We have more policies between 1945 and 2010 with more claims in this period.

ggplot(dataset, aes(x=roofcd, y=yearbuilt, col=hasclaim, fill=hasclaim)) +
  geom_boxplot(notch = TRUE) +
#scale_color_manual(breaks = c("0","1"), 
#                 values=c("#56B4E9","red")) + 
scale_fill_manual(breaks = c("0","1"), 
                       values=c("#56B4E9","red")) +
  theme(legend.position = "none") +
  labs(x = "Roof Code", title = paste("Box Plot of YearBuilt vs Roof Code"))

Tile and wood is used in more modern houses with lower rate of claims. TAR is used in older. COMPO, TAR and OTHER have more claims then other