dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv", header=TRUE)

Claim Partial Dependency XGB Classification dataset exported from a Python notebook

pd_gamma_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA_Severity_Gamma_PartialDependency.csv", header=TRUE)
pd_normal_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA _Severity_Normal_PartialDependency.csv", header=TRUE)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.6.5 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggplot2)
colnames(dataset)
##   [1] "modeldata_id"                       
##   [2] "systemidstart"                      
##   [3] "systemidend"                        
##   [4] "cal_year"                           
##   [5] "startdate"                          
##   [6] "enddate"                            
##   [7] "startdatetm"                        
##   [8] "enddatetm"                          
##   [9] "ecy"                                
##  [10] "log_ecy"                            
##  [11] "policynumber"                       
##  [12] "policy_uniqueid"                    
##  [13] "policyterm"                         
##  [14] "policytype"                         
##  [15] "effectivedate"                      
##  [16] "expirationdate"                     
##  [17] "policystate"                        
##  [18] "policyform"                         
##  [19] "persistency"                        
##  [20] "companycd"                          
##  [21] "carriercd"                          
##  [22] "agency_group"                       
##  [23] "producername"                       
##  [24] "territory"                          
##  [25] "risknumber"                         
##  [26] "risktype"                           
##  [27] "yearbuilt"                          
##  [28] "log_yearbuilt"                      
##  [29] "sqft"                               
##  [30] "log_sqft"                           
##  [31] "stories"                            
##  [32] "roofcd"                             
##  [33] "roofcd_encd"                        
##  [34] "units"                              
##  [35] "occupancycd"                        
##  [36] "occupancy_encd"                     
##  [37] "allperilded"                        
##  [38] "waterded"                           
##  [39] "protectionclass"                    
##  [40] "constructioncd"                     
##  [41] "constructioncd_encd"                
##  [42] "fire_risk_model_score"              
##  [43] "multipolicyind"                     
##  [44] "multipolicyindumbrella"             
##  [45] "earthquakeumbrellaind"              
##  [46] "usagetype"                          
##  [47] "usagetype_encd"                     
##  [48] "ordinanceorlawpct"                  
##  [49] "functionalreplacementcost"          
##  [50] "homegardcreditind"                  
##  [51] "sprinklersystem"                    
##  [52] "landlordind"                        
##  [53] "rentersinsurance"                   
##  [54] "firealarmtype"                      
##  [55] "burglaryalarmtype"                  
##  [56] "waterdetectiondevice"               
##  [57] "neighborhoodcrimewatchind"          
##  [58] "propertymanager"                    
##  [59] "safeguardplusind"                   
##  [60] "kitchenfireextinguisherind"         
##  [61] "gatedcommunityind"                  
##  [62] "deadboltind"                        
##  [63] "poolind"                            
##  [64] "replacementcostdwellingind"         
##  [65] "replacementvalueind"                
##  [66] "serviceline"                        
##  [67] "equipmentbreakdown"                 
##  [68] "numberoffamilies"                   
##  [69] "insuredage"                         
##  [70] "maritalstatus"                      
##  [71] "insurancescore"                     
##  [72] "overriddeninsurancescore"           
##  [73] "insurancescorevalue"                
##  [74] "insscoretiervalueband"              
##  [75] "financialstabilitytier"             
##  [76] "allcov_wp"                          
##  [77] "cova_wp"                            
##  [78] "cova_ep"                            
##  [79] "cova_deductible"                    
##  [80] "log_cova_deductible"                
##  [81] "cova_limit"                         
##  [82] "log_cova_limit"                     
##  [83] "cova_ic_nc_water"                   
##  [84] "hasclaim"                           
##  [85] "cova_il_nc_water"                   
##  [86] "log_cova_il_nc_water"               
##  [87] "water_risk_3_blk"                   
##  [88] "log_water_risk_3_blk"               
##  [89] "water_risk_fre_3_blk"               
##  [90] "log_water_risk_fre_3_blk"           
##  [91] "water_risk_sev_3_blk"               
##  [92] "log_water_risk_sev_3_blk"           
##  [93] "appl_fail_3_blk"                    
##  [94] "fixture_leak_3_blk"                 
##  [95] "pipe_froze_3_blk"                   
##  [96] "plumb_leak_3_blk"                   
##  [97] "rep_cost_3_blk"                     
##  [98] "ustructure_fail_3_blk"              
##  [99] "waterh_fail_3_blk"                  
## [100] "loaddate"                           
## [101] "customer_cnt_active_policies"       
## [102] "customer_cnt_active_policies_binned"
## [103] "cova_ic_nc_water_color"
dataset <- dataset[dataset$cova_il_nc_water>0,]

Quick Overview

str(dataset)
## 'data.frame':    11636 obs. of  103 variables:
##  $ modeldata_id                       : int  2023355 715490 1547393 883567 1255847 68156 449688 771338 307815 478115 ...
##  $ systemidstart                      : int  8610943 1821627 5824311 2381417 4210440 122953 1028121 1998334 556945 1087455 ...
##  $ systemidend                        : int  8909673 2170234 5824311 2381417 4569129 766663 1028136 1998334 684575 1087455 ...
##  $ cal_year                           : int  2020 2014 2018 2014 2016 2011 2012 2014 2011 2012 ...
##  $ startdate                          : Factor w/ 4288 levels "2009-01-01","2009-01-02",..: 4018 1827 3288 2023 2819 755 1096 1827 862 1101 ...
##  $ enddate                            : Factor w/ 4619 levels "2009-01-09","2009-01-12",..: 4299 1999 3577 2159 2890 1063 1222 2119 1063 1429 ...
##  $ startdatetm                        : Factor w/ 23690 levels "2008-01-09 00:00:00",..: 22144 9454 18011 11303 15410 3837 4617 10072 4436 6063 ...
##  $ enddatetm                          : Factor w/ 16455 levels "2009-01-09 00:00:00",..: 16134 6990 13263 8202 11321 3549 4094 7389 3987 4910 ...
##  $ ecy                                : num  0.86 0.561 0.881 0.463 0.285 ...
##  $ log_ecy                            : num  -0.151 -0.578 -0.126 -0.771 -1.256 ...
##  $ policynumber                       : Factor w/ 244668 levels "AZF0082147","AZF0221975",..: 21299 160696 219105 4636 11884 179531 150899 63697 183742 163254 ...
##  $ policy_uniqueid                    : int  1733466 629173 1336770 789985 1094957 10079 280017 696661 292070 356186 ...
##  $ policyterm                         : int  2 5 1 1 7 1 3 1 1 4 ...
##  $ policytype                         : Factor w/ 2 levels "New","Renewal": 2 2 1 1 2 1 2 1 1 2 ...
##  $ effectivedate                      : Factor w/ 4329 levels "2008-01-09","2008-01-14",..: 4009 1710 3288 2066 2862 798 932 1830 905 1144 ...
##  $ expirationdate                     : Factor w/ 4450 levels "2009-01-09","2009-01-14",..: 4130 1831 3409 2187 2983 919 1054 1951 1027 1266 ...
##  $ policystate                        : Factor w/ 3 levels "AZ","CA","NV": 1 2 2 1 1 2 2 2 2 2 ...
##  $ policyform                         : Factor w/ 9 levels "DF1","DF3","DF6",..: 9 9 8 2 9 9 9 2 9 9 ...
##  $ persistency                        : int  1 5 0 0 16 0 12 0 0 3 ...
##  $ companycd                          : int  16 17 19 1 16 17 1 17 17 17 ...
##  $ carriercd                          : Factor w/ 2 levels "CSEICO","CSESG": 1 2 2 1 1 2 1 2 2 2 ...
##  $ agency_group                       : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 593 426 292 265 316 378 316 426 378 437 ...
##  $ producername                       : Factor w/ 1269 levels "1ST CENTURY INS SVCS INC.",..: 1240 888 612 452 651 738 652 947 738 967 ...
##  $ territory                          : Factor w/ 10 levels "","AZ-A","AZ-T",..: 2 7 4 6 6 8 6 7 8 6 ...
##  $ risknumber                         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ risktype                           : Factor w/ 2 levels "Dwelling","Homeowners": 2 2 2 1 2 2 2 1 2 2 ...
##  $ yearbuilt                          : int  1979 1994 1976 2006 1992 1966 1963 2001 2004 1977 ...
##  $ log_yearbuilt                      : num  7.59 7.6 7.59 7.6 7.6 ...
##  $ sqft                               : int  1600 2000 1300 2700 1200 1600 1300 1600 1800 3200 ...
##  $ log_sqft                           : num  7.44 7.61 7.2 7.92 7.09 ...
##  $ stories                            : int  2 1 1 2 1 1 1 2 1 1 ...
##  $ roofcd                             : Factor w/ 7 levels "COMPO","MEMBRANE",..: 4 6 6 1 1 1 4 6 6 6 ...
##  $ roofcd_encd                        : int  6 7 7 8 8 8 6 7 7 7 ...
##  $ units                              : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ occupancycd                        : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ occupancy_encd                     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ allperilded                        : int  1000 1000 2500 1000 1000 1000 500 1000 1000 1000 ...
##  $ waterded                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ protectionclass                    : int  1 3 0 4 6 4 2 2 4 3 ...
##  $ constructioncd                     : Factor w/ 5 levels "AF","B","F","M",..: 5 3 1 4 3 3 3 1 3 3 ...
##  $ constructioncd_encd                : int  2 5 4 1 5 5 5 4 5 5 ...
##  $ fire_risk_model_score              : int  0 0 0 -1 0 0 0 2 2 0 ...
##  $ multipolicyind                     : int  1 1 0 0 1 0 0 0 0 0 ...
##  $ multipolicyindumbrella             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ earthquakeumbrellaind              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ usagetype                          : Factor w/ 7 levels "COC","PRIMARY",..: 2 2 2 3 2 2 2 3 2 2 ...
##  $ usagetype_encd                     : int  7 7 7 6 7 7 7 6 7 7 ...
##  $ ordinanceorlawpct                  : int  10 0 0 0 0 0 10 10 0 0 ...
##  $ functionalreplacementcost          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ homegardcreditind                  : int  1 0 0 0 0 0 1 0 0 0 ...
##  $ sprinklersystem                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ landlordind                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rentersinsurance                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ firealarmtype                      : int  0 0 1 1 1 0 1 1 1 0 ...
##  $ burglaryalarmtype                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ waterdetectiondevice               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ neighborhoodcrimewatchind          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ propertymanager                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ safeguardplusind                   : int  0 0 0 0 0 1 0 1 0 1 ...
##  $ kitchenfireextinguisherind         : int  0 1 0 0 0 0 0 1 0 1 ...
##  $ gatedcommunityind                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ deadboltind                        : int  1 1 1 1 0 0 0 1 1 1 ...
##  $ poolind                            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ replacementcostdwellingind         : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ replacementvalueind                : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ serviceline                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ equipmentbreakdown                 : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ numberoffamilies                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ insuredage                         : int  43 39 61 50 48 40 57 45 45 57 ...
##  $ maritalstatus                      : Factor w/ 5 levels "~","Divorced",..: 4 4 4 4 1 1 1 3 4 1 ...
##  $ insurancescore                     : Factor w/ 3842 levels "(DOES","~","610",..: 3294 2 2 516 2 2 2 2 2 2 ...
##  $ overriddeninsurancescore           : Factor w/ 41 levels "~","01","02",..: 17 1 1 30 40 1 1 1 1 1 ...
##  $ insurancescorevalue                : Factor w/ 178 levels "~","605","626",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ insscoretiervalueband              : Factor w/ 24 levels "~","599-606",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ financialstabilitytier             : Factor w/ 42 levels "","~","01","02",..: 18 1 1 1 1 1 1 1 1 1 ...
##  $ allcov_wp                          : int  1096 688 1075 634 605 863 643 636 722 1207 ...
##  $ cova_wp                            : int  1036 964 655 598 901 852 1048 475 949 1472 ...
##  $ cova_ep                            : num  360 541 577 277 257 ...
##  $ cova_deductible                    : int  1000 1000 2500 1000 1000 1000 500 1000 1000 1000 ...
##  $ log_cova_deductible                : num  6.91 6.91 7.82 6.91 6.91 ...
##  $ cova_limit                         : int  300000 400000 300000 300000 200000 300000 300000 400000 300000 600000 ...
##  $ log_cova_limit                     : num  12.3 12.7 12.6 12.5 11.9 ...
##  $ cova_ic_nc_water                   : int  1 1 1 1 1 1 1 2 1 1 ...
##  $ hasclaim                           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cova_il_nc_water                   : num  1146 13187 21404 790 22400 ...
##  $ log_cova_il_nc_water               : num  7.04 9.49 9.97 6.67 10.02 ...
##  $ water_risk_3_blk                   : int  165 112 593 168 190 219 136 245 212 222 ...
##  $ log_water_risk_3_blk               : num  5.11 4.72 6.39 5.12 5.25 ...
##  $ water_risk_fre_3_blk               : int  244 100 484 174 188 143 122 206 166 175 ...
##  $ log_water_risk_fre_3_blk           : num  5.5 4.61 6.18 5.16 5.24 ...
##  $ water_risk_sev_3_blk               : int  70 115 126 99 104 157 115 122 132 131 ...
##  $ log_water_risk_sev_3_blk           : num  4.25 4.74 4.84 4.6 4.64 ...
##  $ appl_fail_3_blk                    : int  1 5 5 1 5 4 5 5 5 4 ...
##  $ fixture_leak_3_blk                 : int  4 3 2 4 5 2 2 2 2 2 ...
##  $ pipe_froze_3_blk                   : int  0 1 0 3 3 2 3 0 2 3 ...
##  $ plumb_leak_3_blk                   : int  4 4 4 5 5 4 1 5 4 4 ...
##  $ rep_cost_3_blk                     : int  4 1 5 1 0 5 5 5 5 5 ...
##  $ ustructure_fail_3_blk              : int  5 5 5 5 5 5 5 5 5 2 ...
##  $ waterh_fail_3_blk                  : int  2 2 0 5 5 2 2 2 2 4 ...
##   [list output truncated]
summary(dataset)
##   modeldata_id     systemidstart      systemidend         cal_year   
##  Min.   :    179   Min.   :    745   Min.   :    745   Min.   :2009  
##  1st Qu.: 547915   1st Qu.:1259699   1st Qu.:1376745   1st Qu.:2013  
##  Median :1039204   Median :2963060   Median :3133598   Median :2016  
##  Mean   :1029895   Mean   :3542800   Mean   :3679403   Mean   :2015  
##  3rd Qu.:1535314   3rd Qu.:5742663   3rd Qu.:5942212   3rd Qu.:2018  
##  Max.   :2102494   Max.   :9136529   Max.   :9647449   Max.   :2020  
##                                                                      
##       startdate          enddate                  startdatetm   
##  2019-01-01: 860   2019-01-01: 794   2018-05-01 00:00:00:   14  
##  2018-01-01: 738   2018-01-01: 635   2016-10-01 00:00:00:   13  
##  2017-01-01: 573   2017-01-01: 601   2018-02-28 00:00:00:   12  
##  2016-01-01: 557   2020-01-01: 577   2017-09-01 00:00:00:   11  
##  2013-01-01: 494   2015-01-01: 505   2018-08-17 00:00:00:   11  
##  2014-01-01: 486   2016-01-01: 499   2016-10-15 00:00:00:   10  
##  (Other)   :7928   (Other)   :8025   (Other)            :11565  
##                enddatetm          ecy            log_ecy         
##  2019-05-01 00:00:00:   17   Min.   :0.0054   Min.   :-5.221356  
##  2018-08-01 00:00:00:   14   1st Qu.:0.4572   1st Qu.:-0.782634  
##  2018-09-01 00:00:00:   14   Median :0.6625   Median :-0.411735  
##  2019-07-01 00:00:00:   14   Mean   :0.6343   Mean   :-0.560624  
##  2017-10-01 00:00:00:   13   3rd Qu.:0.8350   3rd Qu.:-0.180324  
##  2019-02-28 00:00:00:   13   Max.   :1.0020   Max.   : 0.001998  
##  (Other)            :11551                                       
##      policynumber   policy_uniqueid     policyterm       policytype  
##  CAH6012203:    6   Min.   :    114   Min.   : 1.000   New    :2704  
##  CAH0704671:    4   1st Qu.: 495973   1st Qu.: 2.000   Renewal:8932  
##  CAH0751641:    4   Median : 912645   Median : 3.000                 
##  CAH0752785:    4   Mean   : 895551   Mean   : 3.818                 
##  CAH0755516:    4   3rd Qu.:1323472   3rd Qu.: 6.000                 
##  CAH0755561:    4   Max.   :1794004   Max.   :12.000                 
##  (Other)   :11610                                                    
##     effectivedate      expirationdate  policystate       policyform  
##  2018-05-01:   17   2019-05-01:   17   AZ: 1076    HO3        :7714  
##  2017-08-01:   15   2018-08-01:   15   CA:10127    DF3        :2961  
##  2016-10-01:   14   2017-10-01:   14   NV:  433    DF6        : 563  
##  2018-07-01:   14   2019-07-01:   14               Form3      : 323  
##  2018-08-01:   14   2019-08-01:   14               FL3-Special:  53  
##  2018-08-17:   14   2019-08-17:   14               FL2-Broad  :  14  
##  (Other)   :11548   (Other)   :11548               (Other)    :   8  
##   persistency       companycd      carriercd   
##  Min.   : 0.000   Min.   : 1.00   CSEICO:4629  
##  1st Qu.: 1.000   1st Qu.: 1.00   CSESG :7007  
##  Median : 3.000   Median :17.00                
##  Mean   : 6.188   Mean   :11.51                
##  3rd Qu.: 9.000   3rd Qu.:17.00                
##  Max.   :44.000   Max.   :19.00                
##                                                
##                                        agency_group 
##  WESTERN GOLD INS AGCY INC.                  : 870  
##  J.E. BROWN and ASSOCS INS SVCS              : 488  
##  CRUSBERG DECKER INS SVCS INC                : 463  
##  Acrisure of California                      : 461  
##  ISU INSURANCE SERVICES OF  SAN FRANCISCO INC: 430  
##  PIIB - PACIFIC INTERSTATE INS               : 310  
##  (Other)                                     :8614  
##                          producername    territory      risknumber   
##  WESTERN GOLD INS AGCY INC.    : 853   CA-B   :3677   Min.   :1.000  
##  J.E. BROWN and ASSOCS INS SVCS: 465   CA-C   :3008   1st Qu.:1.000  
##  CRUSBERG DECKER INS SVCS INC  : 291   CA-A   :1467   Median :1.000  
##  Acrisure of California LLC    : 247   CA-O   :1233   Mean   :1.001  
##  HUB INTERNATIONAL INS SVCS    : 169   AZ-A   : 929   3rd Qu.:1.000  
##  MULTI-STATE INS SERVICES INC  : 166   CA-T   : 812   Max.   :2.000  
##  (Other)                       :9445   (Other): 510                  
##        risktype      yearbuilt    log_yearbuilt        sqft     
##  Dwelling  :3599   Min.   :1900   Min.   :7.533   Min.   : 800  
##  Homeowners:8037   1st Qu.:1972   1st Qu.:7.587   1st Qu.:1500  
##                    Median :1987   Median :7.594   Median :1900  
##                    Mean   :1982   Mean   :7.592   Mean   :2086  
##                    3rd Qu.:1996   3rd Qu.:7.599   3rd Qu.:2500  
##                    Max.   :2019   Max.   :7.610   Max.   :5000  
##                                                                 
##     log_sqft        stories          roofcd      roofcd_encd   
##  Min.   :6.397   Min.   :1.00   COMPO   :4228   Min.   :1.000  
##  1st Qu.:7.313   1st Qu.:1.00   MEMBRANE: 127   1st Qu.:7.000  
##  Median :7.576   Median :1.00   METAL   :  32   Median :7.000  
##  Mean   :7.585   Mean   :1.26   OTHER   :1646   Mean   :7.073  
##  3rd Qu.:7.862   3rd Qu.:2.00   TAR     : 210   3rd Qu.:8.000  
##  Max.   :9.051   Max.   :3.00   TILE    :5271   Max.   :8.000  
##                                 WOOD    : 122                  
##      units            occupancycd    occupancy_encd   allperilded   
##  Min.   :1.000   NO         :    0   Min.   :1.000   Min.   :  100  
##  1st Qu.:1.000   OCCUPIEDNOW:11093   1st Qu.:1.000   1st Qu.:  500  
##  Median :1.000   TENANT     :  543   Median :1.000   Median : 1000  
##  Mean   :1.089                       Mean   :1.047   Mean   : 1156  
##  3rd Qu.:1.000                       3rd Qu.:1.000   3rd Qu.: 1000  
##  Max.   :4.000                       Max.   :2.000   Max.   :10000  
##                                                                     
##     waterded        protectionclass  constructioncd constructioncd_encd
##  Min.   :    0.00   Min.   : 0.000   AF   :4413     Min.   :1.000      
##  1st Qu.:    0.00   1st Qu.: 2.000   B    : 178     1st Qu.:4.000      
##  Median :    0.00   Median : 3.000   F    :6744     Median :5.000      
##  Mean   :   39.96   Mean   : 3.073   M    : 162     Mean   :4.499      
##  3rd Qu.:    0.00   3rd Qu.: 4.000   OTHER: 139     3rd Qu.:5.000      
##  Max.   :10000.00   Max.   :10.000                  Max.   :5.000      
##                                                                        
##  fire_risk_model_score multipolicyind   multipolicyindumbrella
##  Min.   :-1.0000       Min.   :0.0000   Min.   :0.000000      
##  1st Qu.: 0.0000       1st Qu.:0.0000   1st Qu.:0.000000      
##  Median : 0.0000       Median :0.0000   Median :0.000000      
##  Mean   : 0.3019       Mean   :0.1828   Mean   :0.004727      
##  3rd Qu.: 0.0000       3rd Qu.:0.0000   3rd Qu.:0.000000      
##  Max.   :12.0000       Max.   :1.0000   Max.   :1.000000      
##                                                               
##  earthquakeumbrellaind      usagetype    usagetype_encd  ordinanceorlawpct
##  Min.   :0.00000       COC       :   4   Min.   :2.000   Min.   :  0.000  
##  1st Qu.:0.00000       PRIMARY   :8212   1st Qu.:6.000   1st Qu.:  0.000  
##  Median :0.00000       RENTAL    :3374   Median :7.000   Median : 10.000  
##  Mean   :0.00318       SEASONAL  :  36   Mean   :6.693   Mean   :  9.297  
##  3rd Qu.:0.00000       SECONDARY :   8   3rd Qu.:7.000   3rd Qu.: 10.000  
##  Max.   :1.00000       UNOCCUPIED:   0   Max.   :7.000   Max.   :100.000  
##                        VACANT    :   2                                    
##  functionalreplacementcost homegardcreditind sprinklersystem  
##  Min.   :0.0000000         Min.   :0.000     Min.   :0.00000  
##  1st Qu.:0.0000000         1st Qu.:0.000     1st Qu.:0.00000  
##  Median :0.0000000         Median :0.000     Median :0.00000  
##  Mean   :0.0008594         Mean   :0.208     Mean   :0.03017  
##  3rd Qu.:0.0000000         3rd Qu.:0.000     3rd Qu.:0.00000  
##  Max.   :1.0000000         Max.   :1.000     Max.   :1.00000  
##                                                               
##   landlordind      rentersinsurance   firealarmtype    burglaryalarmtype
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.0000   
##  Median :0.00000   Median :0.000000   Median :1.0000   Median :0.0000   
##  Mean   :0.03644   Mean   :0.004383   Mean   :0.6368   Mean   :0.3304   
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:1.0000   3rd Qu.:1.0000   
##  Max.   :1.00000   Max.   :1.000000   Max.   :1.0000   Max.   :1.0000   
##                                                                         
##  waterdetectiondevice neighborhoodcrimewatchind propertymanager 
##  Min.   :0.0000000    Min.   :0.00000           Min.   :0.0000  
##  1st Qu.:0.0000000    1st Qu.:0.00000           1st Qu.:0.0000  
##  Median :0.0000000    Median :0.00000           Median :0.0000  
##  Mean   :0.0001719    Mean   :0.01934           Mean   :0.0122  
##  3rd Qu.:0.0000000    3rd Qu.:0.00000           3rd Qu.:0.0000  
##  Max.   :1.0000000    Max.   :1.00000           Max.   :1.0000  
##                                                                 
##  safeguardplusind kitchenfireextinguisherind gatedcommunityind
##  Min.   :0.0000   Min.   :0.000              Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.000              1st Qu.:0.00000  
##  Median :0.0000   Median :0.000              Median :0.00000  
##  Mean   :0.3682   Mean   :0.419              Mean   :0.01916  
##  3rd Qu.:1.0000   3rd Qu.:1.000              3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.000              Max.   :1.00000  
##                                                               
##   deadboltind        poolind        replacementcostdwellingind
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000            
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000            
##  Median :1.0000   Median :0.00000   Median :0.0000            
##  Mean   :0.6983   Mean   :0.06033   Mean   :0.2474            
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.0000            
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000            
##                                                               
##  replacementvalueind  serviceline     equipmentbreakdown numberoffamilies
##  Min.   :0.00000     Min.   :0.0000   Min.   :0.0000     Min.   :1.000   
##  1st Qu.:0.00000     1st Qu.:0.0000   1st Qu.:0.0000     1st Qu.:1.000   
##  Median :0.00000     Median :0.0000   Median :0.0000     Median :1.000   
##  Mean   :0.02424     Mean   :0.1282   Mean   :0.1384     Mean   :1.089   
##  3rd Qu.:0.00000     3rd Qu.:0.0000   3rd Qu.:0.0000     3rd Qu.:1.000   
##  Max.   :1.00000     Max.   :1.0000   Max.   :1.0000     Max.   :4.000   
##                                                                          
##    insuredage      maritalstatus  insurancescore  overriddeninsurancescore
##  Min.   :  0.00   ~       :4882   ~      :10927   ~      :10490           
##  1st Qu.: 45.00   Divorced:  67   99     :    5   99     :  396           
##  Median : 55.00   Married :3375   DWKDS  :    3   07     :   81           
##  Mean   : 55.11   Single  :3180   FUQQS  :    3   12     :   63           
##  3rd Qu.: 65.00   Widowed : 132   SSDDV  :    3   13     :   50           
##  Max.   :114.00                   VVKVS  :    3   14     :   48           
##  NA's   :1086                     (Other):  692   (Other):  508           
##  insurancescorevalue insscoretiervalueband financialstabilitytier
##  ~      :11629       ~      :11629                :10945         
##  641    :    1       714-731:    2         ~      :  610         
##  666    :    1       640-648:    1         07     :    8         
##  680    :    1       664-672:    1         14     :    8         
##  729    :    1       672-682:    1         08     :    6         
##  731    :    1       820-837:    1         13     :    6         
##  (Other):    2       (Other):    1         (Other):   53         
##    allcov_wp         cova_wp        cova_ep         cova_deductible
##  Min.   :  48.0   Min.   :   9   Min.   :   1.428   Min.   :  100  
##  1st Qu.: 644.8   1st Qu.: 609   1st Qu.: 309.908   1st Qu.:  500  
##  Median : 901.0   Median : 939   Median : 553.833   Median : 1000  
##  Mean   :1014.0   Mean   :1144   Mean   : 719.183   Mean   : 1156  
##  3rd Qu.:1228.0   3rd Qu.:1469   3rd Qu.: 940.123   3rd Qu.: 1000  
##  Max.   :7588.0   Max.   :9468   Max.   :8631.976   Max.   :10000  
##                                                                    
##  log_cova_deductible   cova_limit      log_cova_limit   cova_ic_nc_water
##  Min.   :4.605       Min.   : 100000   Min.   : 8.372   Min.   :1.000   
##  1st Qu.:6.215       1st Qu.: 300000   1st Qu.:12.476   1st Qu.:1.000   
##  Median :6.908       Median : 400000   Median :12.799   Median :1.000   
##  Mean   :6.868       Mean   : 449742   Mean   :12.759   Mean   :1.023   
##  3rd Qu.:6.908       3rd Qu.: 500000   3rd Qu.:13.115   3rd Qu.:1.000   
##  Max.   :9.210       Max.   :1300000   Max.   :14.644   Max.   :3.000   
##                                                                         
##     hasclaim cova_il_nc_water   log_cova_il_nc_water water_risk_3_blk
##  Min.   :1   Min.   :     0.9   Min.   :-0.1054      Min.   :  23.0  
##  1st Qu.:1   1st Qu.:  2942.6   1st Qu.: 7.9870      1st Qu.: 159.0  
##  Median :1   Median :  7082.0   Median : 8.8653      Median : 210.0  
##  Mean   :1   Mean   : 13614.2   Mean   : 8.7738      Mean   : 228.5  
##  3rd Qu.:1   3rd Qu.: 15967.6   3rd Qu.: 9.6783      3rd Qu.: 271.0  
##  Max.   :1   Max.   :522735.2   Max.   :13.1668      Max.   :1056.0  
##                                                                      
##  log_water_risk_3_blk water_risk_fre_3_blk log_water_risk_fre_3_blk
##  Min.   :3.135        Min.   :  21.0       Min.   :3.045           
##  1st Qu.:5.069        1st Qu.: 125.0       1st Qu.:4.828           
##  Median :5.347        Median : 175.0       Median :5.165           
##  Mean   :5.334        Mean   : 186.5       Mean   :5.124           
##  3rd Qu.:5.602        3rd Qu.: 223.0       3rd Qu.:5.407           
##  Max.   :6.962        Max.   :1044.0       Max.   :6.951           
##                                                                    
##  water_risk_sev_3_blk log_water_risk_sev_3_blk appl_fail_3_blk
##  Min.   : 35.0        Min.   :3.555            Min.   :0.000  
##  1st Qu.:108.0        1st Qu.:4.682            1st Qu.:4.000  
##  Median :128.0        Median :4.852            Median :5.000  
##  Mean   :130.4        Mean   :4.843            Mean   :4.257  
##  3rd Qu.:150.0        3rd Qu.:5.011            3rd Qu.:5.000  
##  Max.   :265.0        Max.   :5.580            Max.   :5.000  
##                                                               
##  fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk rep_cost_3_blk
##  Min.   :0.000      Min.   :0.000    Min.   :0.000    Min.   :0.00  
##  1st Qu.:1.000      1st Qu.:0.000    1st Qu.:1.000    1st Qu.:5.00  
##  Median :2.000      Median :2.000    Median :4.000    Median :5.00  
##  Mean   :1.903      Mean   :1.568    Mean   :3.359    Mean   :4.82  
##  3rd Qu.:3.000      3rd Qu.:2.000    3rd Qu.:5.000    3rd Qu.:5.00  
##  Max.   :5.000      Max.   :5.000    Max.   :5.000    Max.   :5.00  
##                                                                     
##  ustructure_fail_3_blk waterh_fail_3_blk                    loaddate    
##  Min.   :0.00          Min.   :0.000     2020-08-07 08:09:37.497:11636  
##  1st Qu.:4.00          1st Qu.:0.000                                    
##  Median :5.00          Median :2.000                                    
##  Mean   :4.33          Mean   :1.331                                    
##  3rd Qu.:5.00          3rd Qu.:2.000                                    
##  Max.   :5.00          Max.   :5.000                                    
##                                                                         
##  customer_cnt_active_policies customer_cnt_active_policies_binned
##  Min.   : 1.000               Min.   : 1.000                     
##  1st Qu.: 1.000               1st Qu.: 1.000                     
##  Median : 1.000               Median : 1.000                     
##  Mean   : 1.247               Mean   : 1.839                     
##  3rd Qu.: 1.000               3rd Qu.: 1.000                     
##  Max.   :28.000               Max.   :30.000                     
##                                                                  
##  cova_ic_nc_water_color
##  Min.   :1.000         
##  1st Qu.:1.000         
##  Median :1.000         
##  Mean   :1.023         
##  3rd Qu.:1.000         
##  Max.   :3.000         
## 

Visualization

Severity

ggplot(dataset, aes(x = .data[['cova_il_nc_water']], color="red",fill="#56B4E9")) +
  geom_histogram(bins=100) +
  labs(x = 'cova_il_nc_water', y = 'Count', title = "Histogram of Losses (cova_il_nc_water)")+
  scale_color_manual(values=c("red")) +  
  scale_fill_manual(values=c("#56B4E9")) +  
  theme(legend.position = "none")

ggplot(dataset, aes(x = .data[['cova_il_nc_water']])) +
  geom_density() + 
  labs(x = 'cova_il_nc_water', y = 'Density', title = "Density of Losses (cova_il_nc_water)")

  vec <- dataset$cova_il_nc_water
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]

ggplot(dataset, aes(sample = .data[['cova_il_nc_water']], col='red')) + 
  stat_qq() + 
  geom_abline(slope = slope, intercept = int) +
  theme(legend.position = "none") +
  labs(y = 'cova_il_nc_water', title = "QQ Plot of Losses (cova_il_nc_water)")

ggplot(dataset, aes(x = .data[['log_cova_il_nc_water']], color="red",fill="#56B4E9")) +
  geom_histogram(bins=100) +
  scale_color_manual(values=c("red")) +  
  scale_fill_manual(values=c("#56B4E9")) + 
  labs(x = 'log(cova_il_nc_water)', y = 'Count', title = "Histogram of Losses (log_cova_il_nc_water)")+
  theme(legend.position = "none")

ggplot(dataset, aes(x = .data[['log_cova_il_nc_water']])) +
  geom_density() + 
  labs(x = 'log(cova_il_nc_water)', y = 'Density', title = 'Density of Losses (log_cova_il_nc_water)')

  vec <- dataset$log_cova_il_nc_water
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]

ggplot(dataset, aes(sample = .data[['log_cova_il_nc_water']], col='red')) + 
  stat_qq() + 
  geom_abline(slope = slope, intercept = int) +
  theme(legend.position = "none") +
  labs(y = 'log(cova_il_nc_water)', title = 'QQ Plot of Losses (log_cova_il_nc_water)')

Looks like gamma distribution and log conversion is more close to normal.

Box Plots of categorical predictors and Losses

plot_categorical <- function(col_name) {
#dataset[[col_name]] <- as.factor(dataset[[col_name]])
#ggplot(dataset, aes(x=.data[[col_name]], y=cova_il_nc_water, col=.data[[col_name]], fill=.data[[col_name]])) +
#  geom_boxplot(notch=notch) +
#  theme(legend.position = "none") +
#  labs(x = col_name, title = paste("Box Plot of Losses and ", col_name))
if ( col_name == 'roofcd_encd') {
col_name_original <- 'roofcd'
} 
  else if ( col_name == 'usagetype_encd' ) {
col_name_original <- 'usagetype'
} 
else if ( col_name == 'constructioncd_encd' ) {
col_name_original <- 'constructioncd'
} 
  else if ( col_name == 'occupancy_encd' ) {
col_name_original <- 'occupancycd'
} 
  else {
  col_name_original <- col_name
}
  
if (grepl('encd',col_name) | 
    col_name =='fire_risk_model_score' | 
    col_name =='customer_cnt_active_policies_binned' |
    col_name =='cova_deductible' | 
    col_name =='cova_limit' | 
    col_name =='protectionclass' | 
    col_name =='ordinanceorlawpct' | 
    col_name =='numberoffamilies' | 
    col_name =='waterded' | 
    col_name =='units' | 
    col_name =='sprinklersystem' |
    col_name =='stories') {
  XBreaks <- as.vector(unlist(unique(dataset[c(col_name)])))
  XLabels <- as.vector(unlist(unique(dataset[c(col_name_original)]))) 
} 
else if (col_name =='pipe_froze_3_blk' | 
col_name =='water_risk_3_blk' | 
col_name =='ustructure_fail_3_blk' | 
col_name =='water_risk_fre_3_blk' | 
col_name =='waterh_fail_3_blk' | 
col_name =='rep_cost_3_blk' | 
col_name =='plumb_leak_3_blk' | 
col_name =='appl_fail_3_blk' | 
col_name =='fixture_leak_3_blk') {
  XBreaks <- c(0,1,2,3,4,5)
  XLabels <- c('Low', 'Elevated', 'Below Avg', 'Average', 'High', 'Highest')    
}
else {
  XBreaks <- c(0,1)
  XLabels <- c("No", "Yes")  

}
  
pd_col <- pd_gamma_dataset[pd_gamma_dataset$feature == col_name,][c('value','pd')]
colnames(pd_col) <- c(col_name,'pd')
plotdata <- dataset %>%
  group_by(.dots = col_name) %>%
  dplyr::summarize(n = n(),
         mean = mean(cova_il_nc_water),
         sd = sd(cova_il_nc_water),
         se = sd / sqrt(n) #,
         #ci = qt(0.975, df = n - 1) * sd / sqrt(n)
         )

plotdata <- merge(plotdata,pd_col,by=col_name)  
cols <- c('Mean Losses with standard error'='#f04546','Losses Partial Dependency'='#3591d1')
p1 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +  
geom_point(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses with standard error'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses with standard error')) +
geom_errorbar(aes(x = .data[[col_name]], y = mean,group = 1, ymin = mean - se,ymax = mean + se,color='Mean Losses with standard error'),width = .1) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency'), size = 3) +  
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels)   +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


p2 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +    
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency'), size = 3) +  
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels)   +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

print(p1)
print(p2)

#----------------------
pd_col <- pd_normal_dataset[pd_normal_dataset$feature == col_name,][c('value','pd')]
if (nrow(pd_col)>0) {
colnames(pd_col) <- c(col_name,'pd')
plotdata <- dataset %>%
  group_by(.dots = col_name) %>%
  dplyr::summarize(n = n(),
         mean = mean(log_cova_il_nc_water),
         sd = sd(log_cova_il_nc_water),
         se = sd / sqrt(n) #,
         #ci = qt(0.975, df = n - 1) * sd / sqrt(n)
         )

plotdata <- merge(plotdata,pd_col,by=col_name)  

cols <- c('Mean Losses (Log) with standard error'='#f04546','Losses (Log) Partial Dependency'='#3591d1')
p1 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +    
geom_point(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses (Log) with standard error'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses (Log) with standard error')) +
geom_errorbar(aes(x = .data[[col_name]], y = mean,group = 1, ymin = mean - se,ymax = mean + se,color='Mean Losses (Log) with standard error'),width = .1) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency'), size = 3) +  
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels)   +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


p2 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +    
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency'), size = 3) +  
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels)   +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

print(p1)
print(p2)
}
}
plot_categorical('functionalreplacementcost')

plot_categorical('numberoffamilies')

plot_categorical('units')

plot_categorical('stories')

plot_categorical('cova_deductible')

plot_categorical('replacementvalueind')

plot_categorical('neighborhoodcrimewatchind')

plot_categorical('fire_risk_model_score')

ggplot(data=dataset, mapping = aes(x=water_risk_sev_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs water_risk_sev_3_blk")

The higest losses are between 100 and 200 water_risk_sev_3_blk. We may have not enough data for higher numbers of water_risk_sev_3_blk

plot_categorical('burglaryalarmtype')

plot_categorical('deadboltind')

plot_categorical('waterh_fail_3_blk')

plot_categorical('safeguardplusind')

plot_categorical('equipmentbreakdown')

plot_categorical('cova_limit')

ggplot(data=dataset, mapping = aes(x=water_risk_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs water_risk_3_blk")

There are more higher losses around 250 water_risk_3_blk

plot_categorical('usagetype_encd')

plot_categorical('multipolicyindumbrella')

plot_categorical('constructioncd_encd')

plot_categorical('replacementcostdwellingind')

plot_categorical('roofcd_encd')

plot_categorical('gatedcommunityind')

plot_categorical('homegardcreditind')

plot_categorical('ustructure_fail_3_blk')

ggplot(data=dataset, mapping = aes(x=water_risk_fre_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs water_risk_fre_3_blk")

plot_categorical('ecy')

ggplot(data=dataset, mapping = aes(x=ecy, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs ecy")

plot_categorical('appl_fail_3_blk')

plot_categorical('pipe_froze_3_blk')

plot_categorical('firealarmtype')

plot_categorical('fixture_leak_3_blk')

plot_categorical('sqft')

ggplot(data=dataset, mapping = aes(x=sqft, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs sqft")

Severity is higher in larger properties

plot_categorical('yearbuilt')

ggplot(data=dataset, mapping = aes(x=yearbuilt, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs YearBuilt")

Severity is higher in newer properties

plot_categorical('customer_cnt_active_policies_binned')

ggplot(data=dataset, mapping = aes(x=customer_cnt_active_policies_binned, y=cova_il_nc_water, color=-cova_il_nc_water)) +
  geom_point() +
  labs(title = "Losses vs customer_cnt_active_policies_binned")

plot_categorical('protectionclass')

plot_categorical('multipolicyind')

plot_categorical('rep_cost_3_blk')

plot_categorical('plumb_leak_3_blk')

plot_categorical('rentersinsurance')

plot_categorical('sprinklersystem')

plot_categorical('kitchenfireextinguisherind')

plot_categorical('poolind')

plot_categorical('landlordind')

plot_categorical('occupancy_encd')

plot_categorical('ordinanceorlawpct')

plot_categorical('waterded')

plot_categorical('serviceline')

plot_categorical('propertymanager')

plot_categorical('earthquakeumbrellaind')

Misc

ggplot(data=dataset, mapping = aes(x=yearbuilt, y=cova_il_nc_water, color=roofcd)) +
  geom_point() +
  labs(title = "Losses vs YearBuilt and Roofcd Limit")

ggplot(data=dataset, mapping = aes(x=sqft, y=cova_il_nc_water, color=-cova_limit)) +
  geom_point() +
  labs(title = "Losses vs Sqft and CovA Limit")