# Read the non-catastrophe water claims extract into `dataset`.
claims_csv <- "C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv"
dataset <- read.csv(claims_csv, header = TRUE)
library(vcd)
## Loading required package: grid
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(fitdistrplus)
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.5.3
## Loading required package: survival
# Print the names of all columns in the loaded table.
colnames(dataset)
##   [1] "modeldata_id"                       
##   [2] "systemidstart"                      
##   [3] "systemidend"                        
##   [4] "cal_year"                           
##   [5] "startdate"                          
##   [6] "enddate"                            
##   [7] "startdatetm"                        
##   [8] "enddatetm"                          
##   [9] "ecy"                                
##  [10] "log_ecy"                            
##  [11] "policynumber"                       
##  [12] "policy_uniqueid"                    
##  [13] "policyterm"                         
##  [14] "policytype"                         
##  [15] "effectivedate"                      
##  [16] "expirationdate"                     
##  [17] "policystate"                        
##  [18] "policyform"                         
##  [19] "persistency"                        
##  [20] "companycd"                          
##  [21] "carriercd"                          
##  [22] "agency_group"                       
##  [23] "producername"                       
##  [24] "territory"                          
##  [25] "risknumber"                         
##  [26] "risktype"                           
##  [27] "yearbuilt"                          
##  [28] "log_yearbuilt"                      
##  [29] "sqft"                               
##  [30] "log_sqft"                           
##  [31] "stories"                            
##  [32] "roofcd"                             
##  [33] "roofcd_encd"                        
##  [34] "units"                              
##  [35] "occupancycd"                        
##  [36] "occupancy_encd"                     
##  [37] "allperilded"                        
##  [38] "waterded"                           
##  [39] "protectionclass"                    
##  [40] "constructioncd"                     
##  [41] "constructioncd_encd"                
##  [42] "fire_risk_model_score"              
##  [43] "multipolicyind"                     
##  [44] "multipolicyindumbrella"             
##  [45] "earthquakeumbrellaind"              
##  [46] "usagetype"                          
##  [47] "usagetype_encd"                     
##  [48] "ordinanceorlawpct"                  
##  [49] "functionalreplacementcost"          
##  [50] "homegardcreditind"                  
##  [51] "sprinklersystem"                    
##  [52] "landlordind"                        
##  [53] "rentersinsurance"                   
##  [54] "firealarmtype"                      
##  [55] "burglaryalarmtype"                  
##  [56] "waterdetectiondevice"               
##  [57] "neighborhoodcrimewatchind"          
##  [58] "propertymanager"                    
##  [59] "safeguardplusind"                   
##  [60] "kitchenfireextinguisherind"         
##  [61] "gatedcommunityind"                  
##  [62] "deadboltind"                        
##  [63] "poolind"                            
##  [64] "replacementcostdwellingind"         
##  [65] "replacementvalueind"                
##  [66] "serviceline"                        
##  [67] "equipmentbreakdown"                 
##  [68] "numberoffamilies"                   
##  [69] "insuredage"                         
##  [70] "maritalstatus"                      
##  [71] "insurancescore"                     
##  [72] "overriddeninsurancescore"           
##  [73] "insurancescorevalue"                
##  [74] "insscoretiervalueband"              
##  [75] "financialstabilitytier"             
##  [76] "allcov_wp"                          
##  [77] "cova_wp"                            
##  [78] "cova_ep"                            
##  [79] "cova_deductible"                    
##  [80] "log_cova_deductible"                
##  [81] "cova_limit"                         
##  [82] "log_cova_limit"                     
##  [83] "cova_ic_nc_water"                   
##  [84] "hasclaim"                           
##  [85] "cova_il_nc_water"                   
##  [86] "log_cova_il_nc_water"               
##  [87] "water_risk_3_blk"                   
##  [88] "log_water_risk_3_blk"               
##  [89] "water_risk_fre_3_blk"               
##  [90] "log_water_risk_fre_3_blk"           
##  [91] "water_risk_sev_3_blk"               
##  [92] "log_water_risk_sev_3_blk"           
##  [93] "appl_fail_3_blk"                    
##  [94] "fixture_leak_3_blk"                 
##  [95] "pipe_froze_3_blk"                   
##  [96] "plumb_leak_3_blk"                   
##  [97] "rep_cost_3_blk"                     
##  [98] "ustructure_fail_3_blk"              
##  [99] "waterh_fail_3_blk"                  
## [100] "loaddate"                           
## [101] "customer_cnt_active_policies"       
## [102] "customer_cnt_active_policies_binned"

Quick Overview

# Show the type and the first few values of every column.
str(dataset)
## 'data.frame':    1995765 obs. of  102 variables:
##  $ modeldata_id                       : int  534254 1793227 880091 1653118 353309 1289956 305141 1115709 2075469 1924235 ...
##  $ systemidstart                      : int  1219556 7403270 2370713 6514641 733342 4348409 544662 3436984 8943949 8072132 ...
##  $ systemidend                        : int  1219556 7403270 2955483 6514641 733342 4348409 544662 3436984 8943949 8072132 ...
##  $ cal_year                           : int  2013 2018 2014 2018 2011 2016 2012 2015 2020 2020 ...
##  $ startdate                          : Factor w/ 4290 levels "2009-01-01","2009-01-02",..: 1462 3611 2067 3424 1030 2878 1096 2544 4093 4018 ...
##  $ enddate                            : Factor w/ 4620 levels "2009-01-09","2009-01-12",..: 1553 3620 2159 3431 1044 2890 1225 2524 4351 4156 ...
##  $ startdatetm                        : Factor w/ 23752 levels "2008-01-09 00:00:00",..: 6832 20120 11523 19086 5616 15738 4639 13893 22823 21287 ...
##  $ enddatetm                          : Factor w/ 16484 levels "2009-01-09 00:00:00",..: 5396 14951 8350 12631 3384 11614 4105 10088 16289 15885 ...
##  $ ecy                                : num  0.339 0.115 0.342 0.11 0.129 ...
##  $ log_ecy                            : num  -1.08 -2.16 -1.07 -2.21 -2.05 ...
##  $ policynumber                       : Factor w/ 247043 levels "AZF0082147","AZF0221975",..: 33600 194123 187155 140475 2424 15381 185108 164391 100140 201424 ...
##  $ policy_uniqueid                    : int  445056 1541369 786905 1427784 348649 1124000 292060 976554 1776548 1654555 ...
##  $ policyterm                         : int  4 7 4 10 1 3 1 7 4 6 ...
##  $ policytype                         : Factor w/ 2 levels "New","Renewal": 2 2 2 2 1 2 1 2 2 2 ...
##  $ effectivedate                      : Factor w/ 4331 levels "2008-01-09","2008-01-14",..: 1264 3654 2110 3467 1073 2921 935 2587 4136 3866 ...
##  $ expirationdate                     : Factor w/ 4451 levels "2009-01-09","2009-01-14",..: 1385 3775 2231 3588 1195 3042 1057 2709 4256 3987 ...
##  $ policystate                        : Factor w/ 3 levels "AZ","CA","NV": 2 2 2 2 1 1 2 2 2 2 ...
##  $ policyform                         : Factor w/ 9 levels "DF1","DF3","DF6",..: 2 9 9 9 2 9 9 9 2 9 ...
##  $ persistency                        : int  3 6 3 25 0 2 0 7 3 5 ...
##  $ companycd                          : int  1 17 17 1 1 16 17 17 17 1 ...
##  $ carriercd                          : Factor w/ 2 levels "CSEICO","CSESG": 1 2 2 1 1 1 2 2 2 1 ...
##  $ agency_group                       : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 446 582 89 50 265 265 378 426 426 426 ...
##  $ producername                       : Factor w/ 1271 levels "1ST CENTURY INS SVCS INC.",..: 983 1226 142 78 452 452 739 889 948 948 ...
##  $ territory                          : Factor w/ 10 levels "","AZ-A","AZ-T",..: 8 6 6 4 6 6 8 7 7 7 ...
##  $ risknumber                         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ risktype                           : Factor w/ 2 levels "Dwelling","Homeowners": 1 2 2 2 1 2 2 2 1 2 ...
##  $ yearbuilt                          : int  1986 1986 1994 1951 1974 1997 1990 1958 2005 1984 ...
##  $ log_yearbuilt                      : num  7.59 7.59 7.6 7.58 7.59 ...
##  $ sqft                               : int  1500 2200 2600 1100 1200 1400 1700 1100 2700 1300 ...
##  $ log_sqft                           : num  7.33 7.73 7.87 7.04 7.12 ...
##  $ stories                            : int  1 2 2 1 1 1 1 1 2 2 ...
##  $ roofcd                             : Factor w/ 7 levels "COMPO","MEMBRANE",..: 6 6 6 4 1 1 6 1 6 1 ...
##  $ roofcd_encd                        : int  7 7 7 6 8 8 7 8 7 8 ...
##  $ units                              : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ occupancycd                        : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ occupancy_encd                     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ allperilded                        : int  500 1000 2500 1000 1000 1000 1000 1000 2500 2500 ...
##  $ waterded                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ protectionclass                    : int  3 2 3 3 2 2 4 3 4 4 ...
##  $ constructioncd                     : Factor w/ 5 levels "AF","B","F","M",..: 3 1 1 3 2 5 3 3 1 1 ...
##  $ constructioncd_encd                : int  5 4 4 5 3 2 5 5 4 4 ...
##  $ fire_risk_model_score              : int  0 2 0 0 -1 -1 2 0 1 2 ...
##  $ multipolicyind                     : int  0 0 0 1 0 0 1 0 0 0 ...
##  $ multipolicyindumbrella             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ earthquakeumbrellaind              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ usagetype                          : Factor w/ 7 levels "COC","PRIMARY",..: 3 2 2 2 3 2 2 2 3 2 ...
##  $ usagetype_encd                     : int  6 7 7 7 6 7 7 7 6 7 ...
##  $ ordinanceorlawpct                  : int  10 20 0 10 0 0 0 0 10 10 ...
##  $ functionalreplacementcost          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ homegardcreditind                  : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ sprinklersystem                    : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ landlordind                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rentersinsurance                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ firealarmtype                      : int  0 0 1 1 1 1 1 0 1 1 ...
##  $ burglaryalarmtype                  : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ waterdetectiondevice               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ neighborhoodcrimewatchind          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ propertymanager                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ safeguardplusind                   : int  0 1 0 0 0 0 0 1 1 0 ...
##  $ kitchenfireextinguisherind         : int  1 1 1 0 0 0 0 1 0 0 ...
##  $ gatedcommunityind                  : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ deadboltind                        : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ poolind                            : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ replacementcostdwellingind         : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ replacementvalueind                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ serviceline                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ equipmentbreakdown                 : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ numberoffamilies                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ insuredage                         : int  66 61 39 92 79 36 72 59 55 69 ...
##  $ maritalstatus                      : Factor w/ 5 levels "~","Divorced",..: 1 3 3 1 4 4 4 1 3 3 ...
##  $ insurancescore                     : Factor w/ 3850 levels "(DOES","~","610",..: 2 2 2 2 2 3817 2 2 2 2 ...
##  $ overriddeninsurancescore           : Factor w/ 41 levels "~","01","02",..: 1 1 1 1 1 13 1 1 1 1 ...
##  $ insurancescorevalue                : Factor w/ 262 levels "~","568","602",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ insscoretiervalueband              : Factor w/ 29 levels "~","559-599",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ financialstabilitytier             : Factor w/ 42 levels "","~","01","02",..: 1 2 1 1 1 1 1 1 2 2 ...
##  $ allcov_wp                          : num  1029 1938 915 915 336 ...
##  $ cova_wp                            : num  1029 1342 1732 1721 282 ...
##  $ cova_ep                            : num  349.2 154.2 592.7 188.4 36.3 ...
##  $ cova_deductible                    : int  500 1000 2500 1000 1000 1000 1000 1000 2500 2500 ...
##  $ log_cova_deductible                : num  6.21 6.91 7.82 6.91 6.91 ...
##  $ cova_limit                         : int  600000 600000 600000 400000 200000 200000 300000 300000 700000 400000 ...
##  $ log_cova_limit                     : num  13.2 13.3 13.2 12.7 11.7 ...
##  $ cova_ic_nc_water                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hasclaim                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cova_il_nc_water                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ log_cova_il_nc_water               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ water_risk_3_blk                   : int  230 329 213 94 234 183 329 128 171 237 ...
##  $ log_water_risk_3_blk               : num  5.44 5.8 5.36 4.54 5.46 ...
##  $ water_risk_fre_3_blk               : int  152 218 134 53 256 226 218 83 159 180 ...
##  $ log_water_risk_fre_3_blk           : num  5.02 5.38 4.9 3.97 5.55 ...
##  $ water_risk_sev_3_blk               : int  156 155 163 182 94 83 155 159 110 136 ...
##  $ log_water_risk_sev_3_blk           : num  5.05 5.04 5.09 5.2 4.54 ...
##  $ appl_fail_3_blk                    : int  5 4 1 5 5 5 5 2 5 5 ...
##  $ fixture_leak_3_blk                 : int  4 2 1 0 3 5 2 2 0 1 ...
##  $ pipe_froze_3_blk                   : int  0 0 0 2 0 0 2 2 2 2 ...
##  $ plumb_leak_3_blk                   : int  5 5 5 1 5 4 4 4 1 4 ...
##  $ rep_cost_3_blk                     : int  5 5 5 5 5 4 5 5 5 5 ...
##  $ ustructure_fail_3_blk              : int  5 5 5 3 5 5 5 5 1 5 ...
##  $ waterh_fail_3_blk                  : int  2 2 2 0 0 1 2 0 2 1 ...
##   [list output truncated]
# Per-column summary statistics (quantiles for numeric columns, level counts
# for factors, NA counts where present).
summary(dataset)
##   modeldata_id     systemidstart      systemidend         cal_year   
##  Min.   :      1   Min.   :      2   Min.   :      3   Min.   :2009  
##  1st Qu.: 546307   1st Qu.:1254638   1st Qu.:1353054   1st Qu.:2013  
##  Median :1095214   Median :3292474   Median :3445261   Median :2016  
##  Mean   :1091473   Mean   :3892680   Mean   :3993420   Mean   :2015  
##  3rd Qu.:1637953   3rd Qu.:6415028   3rd Qu.:6591991   3rd Qu.:2018  
##  Max.   :2178777   Max.   :9657386   Max.   :9657502   Max.   :2021  
##                                                                      
##       startdate             enddate                     startdatetm     
##  2019-01-01: 113586   2019-01-01: 113557   2018-06-01 00:00:00:   1707  
##  2018-01-01: 108834   2018-01-01: 108763   2017-09-01 00:00:00:   1557  
##  2020-01-01:  96907   2020-01-01:  96974   2018-07-01 00:00:00:   1547  
##  2017-01-01:  88488   2017-01-01:  88375   2017-07-01 00:00:00:   1508  
##  2016-01-01:  76458   2016-01-01:  76444   2018-08-01 00:00:00:   1454  
##  2013-01-01:  74669   2013-01-01:  74678   2018-09-01 00:00:00:   1435  
##  (Other)   :1436823   (Other)   :1436974   (Other)            :1986557  
##                enddatetm            ecy            log_ecy         
##  2019-06-01 00:00:00:   2013   Min.   :0.0027   Min.   :-5.914504  
##  2018-06-01 00:00:00:   1893   1st Qu.:0.2464   1st Qu.:-1.400799  
##  2019-07-01 00:00:00:   1887   Median :0.4709   Median :-0.753110  
##  2019-08-01 00:00:00:   1842   Mean   :0.4793   Mean   :-1.021357  
##  2018-07-01 00:00:00:   1829   3rd Qu.:0.7091   3rd Qu.:-0.343759  
##  2018-09-01 00:00:00:   1802   Max.   :1.0020   Max.   : 0.001998  
##  (Other)            :1984499                                       
##      policynumber     policy_uniqueid     policyterm       policytype     
##  CAF0389924:    201   Min.   :      1   Min.   : 1.000   New    : 478819  
##  CAF0461789:    192   1st Qu.: 494781   1st Qu.: 2.000   Renewal:1516946  
##  CAF0475516:    168   Median : 959846   Median : 3.000                    
##  CAF0464778:    144   Mean   : 946658   Mean   : 3.792                    
##  CAF0393082:    130   3rd Qu.:1411214   3rd Qu.: 5.000                    
##  CAF0468201:    120   Max.   :1854593   Max.   :13.000                    
##  (Other)   :1994810                                                       
##     effectivedate        expirationdate    policystate 
##  2018-06-01:   2058   2019-06-01:   2058   AZ: 149213  
##  2017-07-01:   1945   2018-07-01:   1943   CA:1779623  
##  2017-09-01:   1923   2018-09-01:   1923   NV:  66929  
##  2018-07-01:   1914   2019-07-01:   1913               
##  2017-06-01:   1901   2018-06-01:   1904               
##  2018-08-01:   1898   2019-08-01:   1892               
##  (Other)   :1984126   (Other)   :1984132               
##        policyform       persistency        companycd      carriercd      
##  HO3        :1013886   Min.   :  0.000   Min.   : 1.00   CSEICO: 688867  
##  DF3        : 815992   1st Qu.:  1.000   1st Qu.: 1.00   CSESG :1306898  
##  DF6        :  84229   Median :  3.000   Median :17.00                   
##  Form3      :  39641   Mean   :  6.068   Mean   :12.09                   
##  FL1-Vacant :  22667   3rd Qu.:  8.000   3rd Qu.:17.00                   
##  FL3-Special:  10316   Max.   :103.000   Max.   :19.00                   
##  (Other)    :   9034                                                     
##                                        agency_group    
##  WESTERN GOLD INS AGCY INC.                  : 236520  
##  J.E. BROWN and ASSOCS INS SVCS              :  87449  
##  PIIB - PACIFIC INTERSTATE INS               :  63855  
##  CRUSBERG DECKER INS SVCS INC                :  61767  
##  ISU INSURANCE SERVICES OF  SAN FRANCISCO INC:  53631  
##  Acrisure of California                      :  47846  
##  (Other)                                     :1444697  
##                          producername       territory     
##  WESTERN GOLD INS AGCY INC.    : 234904   CA-B   :604119  
##  J.E. BROWN and ASSOCS INS SVCS:  82990   CA-C   :410766  
##  CRUSBERG DECKER INS SVCS INC  :  40912   CA-O   :350944  
##  TOMM and BUCK INSURANCE SRVCS :  23611   CA-A   :302099  
##  BICHLMEIER INSURANCE SRVS INC :  23410   AZ-A   :131863  
##  VALLEY WEST FINANCIAL INS SVC :  22406   CA-T   :125226  
##  (Other)                       :1567532   (Other): 70748  
##    risknumber           risktype         yearbuilt    log_yearbuilt  
##  Min.   : 0.000   Dwelling  : 942238   Min.   :1900   Min.   :7.523  
##  1st Qu.: 1.000   Homeowners:1053527   1st Qu.:1959   1st Qu.:7.580  
##  Median : 1.000                        Median :1980   Median :7.591  
##  Mean   : 1.006                        Mean   :1976   Mean   :7.589  
##  3rd Qu.: 1.000                        3rd Qu.:1996   3rd Qu.:7.599  
##  Max.   :16.000                        Max.   :2019   Max.   :7.611  
##                                                                      
##       sqft         log_sqft        stories           roofcd       
##  Min.   : 800   Min.   :6.397   Min.   :1.000   COMPO   :1000077  
##  1st Qu.:1300   1st Qu.:7.178   1st Qu.:1.000   MEMBRANE:  23443  
##  Median :1700   Median :7.439   Median :1.000   METAL   :   5141  
##  Mean   :1874   Mean   :7.471   Mean   :1.208   OTHER   : 234652  
##  3rd Qu.:2300   3rd Qu.:7.750   3rd Qu.:1.000   TAR     :  45170  
##  Max.   :5000   Max.   :9.210   Max.   :3.000   TILE    : 669164  
##                                                 WOOD    :  18118  
##   roofcd_encd       units            occupancycd      occupancy_encd 
##  Min.   :1.00   Min.   :1.000   NO         :      5   Min.   :1.000  
##  1st Qu.:7.00   1st Qu.:1.000   OCCUPIEDNOW:1861796   1st Qu.:1.000  
##  Median :8.00   Median :1.000   TENANT     : 133964   Median :1.000  
##  Mean   :7.23   Mean   :1.125                         Mean   :1.067  
##  3rd Qu.:8.00   3rd Qu.:1.000                         3rd Qu.:1.000  
##  Max.   :8.00   Max.   :4.000                         Max.   :3.000  
##                                                                      
##   allperilded       waterded        protectionclass  constructioncd 
##  Min.   :    0   Min.   :    0.00   Min.   : 0.000   AF   : 805663  
##  1st Qu.: 1000   1st Qu.:    0.00   1st Qu.: 2.000   B    :  22576  
##  Median : 1000   Median :    0.00   Median : 3.000   F    :1121787  
##  Mean   : 1405   Mean   :   95.49   Mean   : 2.945   M    :  22679  
##  3rd Qu.: 2500   3rd Qu.:    0.00   3rd Qu.: 4.000   OTHER:  23060  
##  Max.   :10000   Max.   :10000.00   Max.   :10.000                  
##                                                                     
##  constructioncd_encd fire_risk_model_score multipolicyind  
##  Min.   :1.000       Min.   :-1.0000       Min.   :0.0000  
##  1st Qu.:4.000       1st Qu.: 0.0000       1st Qu.:0.0000  
##  Median :5.000       Median : 0.0000       Median :0.0000  
##  Mean   :4.494       Mean   : 0.2133       Mean   :0.1618  
##  3rd Qu.:5.000       3rd Qu.: 0.0000       3rd Qu.:0.0000  
##  Max.   :5.000       Max.   :18.0000       Max.   :1.0000  
##                                                            
##  multipolicyindumbrella earthquakeumbrellaind      usagetype      
##  Min.   :0.000000       Min.   :0.000000      COC       :  11506  
##  1st Qu.:0.000000       1st Qu.:0.000000      PRIMARY   :1110389  
##  Median :0.000000       Median :0.000000      RENTAL    : 850681  
##  Mean   :0.005219       Mean   :0.004454      SEASONAL  :   8401  
##  3rd Qu.:0.000000       3rd Qu.:0.000000      SECONDARY :   3618  
##  Max.   :1.000000       Max.   :1.000000      UNOCCUPIED:    321  
##                                               VACANT    :  10849  
##  usagetype_encd  ordinanceorlawpct functionalreplacementcost
##  Min.   :1.000   Min.   :  0.000   Min.   :0.000000         
##  1st Qu.:6.000   1st Qu.:  0.000   1st Qu.:0.000000         
##  Median :7.000   Median : 10.000   Median :0.000000         
##  Mean   :6.519   Mean   :  9.894   Mean   :0.001919         
##  3rd Qu.:7.000   3rd Qu.: 10.000   3rd Qu.:0.000000         
##  Max.   :7.000   Max.   :100.000   Max.   :1.000000         
##                                                             
##  homegardcreditind sprinklersystem   landlordind      rentersinsurance  
##  Min.   :0.0000    Min.   :0.0000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.0000    1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.0000    Median :0.0000   Median :0.00000   Median :0.000000  
##  Mean   :0.1413    Mean   :0.0321   Mean   :0.08072   Mean   :0.005927  
##  3rd Qu.:0.0000    3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :1.0000    Max.   :1.0000   Max.   :1.00000   Max.   :1.000000  
##                                                                         
##  firealarmtype    burglaryalarmtype waterdetectiondevice
##  Min.   :0.0000   Min.   :0.000     Min.   :0.0000000   
##  1st Qu.:0.0000   1st Qu.:0.000     1st Qu.:0.0000000   
##  Median :1.0000   Median :0.000     Median :0.0000000   
##  Mean   :0.6268   Mean   :0.366     Mean   :0.0002465   
##  3rd Qu.:1.0000   3rd Qu.:1.000     3rd Qu.:0.0000000   
##  Max.   :1.0000   Max.   :1.000     Max.   :1.0000000   
##                                                         
##  neighborhoodcrimewatchind propertymanager  safeguardplusind
##  Min.   :0.00000           Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000           1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000           Median :0.0000   Median :0.0000  
##  Mean   :0.01384           Mean   :0.0209   Mean   :0.3696  
##  3rd Qu.:0.00000           3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.00000           Max.   :1.0000   Max.   :1.0000  
##                                                             
##  kitchenfireextinguisherind gatedcommunityind  deadboltind    
##  Min.   :0.0000             Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000             1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000             Median :0.00000   Median :1.0000  
##  Mean   :0.3976             Mean   :0.01443   Mean   :0.7204  
##  3rd Qu.:1.0000             3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.0000             Max.   :1.00000   Max.   :1.0000  
##                                                               
##     poolind        replacementcostdwellingind replacementvalueind
##  Min.   :0.00000   Min.   :0.0000             Min.   :0.00000    
##  1st Qu.:0.00000   1st Qu.:0.0000             1st Qu.:0.00000    
##  Median :0.00000   Median :0.0000             Median :0.00000    
##  Mean   :0.03582   Mean   :0.3577             Mean   :0.01676    
##  3rd Qu.:0.00000   3rd Qu.:1.0000             3rd Qu.:0.00000    
##  Max.   :1.00000   Max.   :1.0000             Max.   :1.00000    
##                                                                  
##   serviceline     equipmentbreakdown numberoffamilies   insuredage    
##  Min.   :0.0000   Min.   :0.0000     Min.   :0.000    Min.   :  0.0   
##  1st Qu.:0.0000   1st Qu.:0.0000     1st Qu.:1.000    1st Qu.: 46.0   
##  Median :0.0000   Median :0.0000     Median :1.000    Median : 57.0   
##  Mean   :0.1178   Mean   :0.1202     Mean   :1.126    Mean   : 56.4   
##  3rd Qu.:0.0000   3rd Qu.:0.0000     3rd Qu.:1.000    3rd Qu.: 66.0   
##  Max.   :1.0000   Max.   :1.0000     Max.   :4.000    Max.   :177.0   
##                                                       NA's   :236486  
##   maritalstatus    insurancescore    overriddeninsurancescore
##  ~       :794711   ~      :1906604   ~      :1849724         
##  Divorced:  8153   99     :    418   99     :  50147         
##  Married :566894   KQXKD  :    155   07     :   9717         
##  Single  :606747   ZKKWS  :    119   12     :   8082         
##  Widowed : 19260   KFWZV  :    108   04     :   6351         
##                    XXQKS  :    101   06     :   5877         
##                    (Other):  88260   (Other):  65867         
##  insurancescorevalue insscoretiervalueband financialstabilitytier
##  ~      :1994253     ~      :1994253              :1740149       
##  825    :     40     837-865:    168       ~      : 238805       
##  829    :     21     748-774:    147       07     :   1630       
##  881    :     21     865-880:    141       12     :   1394       
##  873    :     20     714-731:    129       04     :   1076       
##  868    :     19     820-837:    129       13     :   1010       
##  (Other):   1391     (Other):    798       (Other):  11701       
##    allcov_wp          cova_wp           cova_ep        cova_deductible
##  Min.   :   32.0   Min.   :    5.0   Min.   :-8666.2   Min.   :    0  
##  1st Qu.:  549.0   1st Qu.:  468.0   1st Qu.:  115.6   1st Qu.: 1000  
##  Median :  778.0   Median :  755.0   Median :  282.6   Median : 1000  
##  Mean   :  911.3   Mean   :  973.1   Mean   :  409.9   Mean   : 1405  
##  3rd Qu.: 1105.0   3rd Qu.: 1230.0   3rd Qu.:  566.5   3rd Qu.: 2500  
##  Max.   :18926.0   Max.   :14620.0   Max.   :12121.9   Max.   :10000  
##                                                                       
##  log_cova_deductible   cova_limit      log_cova_limit   cova_ic_nc_water  
##  Min.   :0.000       Min.   : 100000   Min.   : 8.294   Min.   :0.000000  
##  1st Qu.:6.908       1st Qu.: 300000   1st Qu.:12.385   1st Qu.:0.000000  
##  Median :6.908       Median : 400000   Median :12.702   Median :0.000000  
##  Mean   :7.019       Mean   : 422331   Mean   :12.689   Mean   :0.005966  
##  3rd Qu.:7.824       3rd Qu.: 500000   3rd Qu.:13.036   3rd Qu.:0.000000  
##  Max.   :9.210       Max.   :1300000   Max.   :14.944   Max.   :3.000000  
##                                                                           
##     hasclaim        cova_il_nc_water   log_cova_il_nc_water
##  Min.   :0.000000   Min.   : -5536.9   Min.   :-0.10536    
##  1st Qu.:0.000000   1st Qu.:     0.0   1st Qu.: 0.00000    
##  Median :0.000000   Median :     0.0   Median : 0.00000    
##  Mean   :0.005833   Mean   :    79.4   Mean   : 0.05118    
##  3rd Qu.:0.000000   3rd Qu.:     0.0   3rd Qu.: 0.00000    
##  Max.   :1.000000   Max.   :522735.2   Max.   :13.16683    
##                                        NA's   :3           
##  water_risk_3_blk log_water_risk_3_blk water_risk_fre_3_blk
##  Min.   :   0.0   Min.   :0.000        Min.   :   0        
##  1st Qu.: 134.0   1st Qu.:4.898        1st Qu.: 108        
##  Median : 185.0   Median :5.220        Median : 154        
##  Mean   : 202.1   Mean   :5.189        Mean   : 169        
##  3rd Qu.: 244.0   3rd Qu.:5.497        3rd Qu.: 209        
##  Max.   :1491.0   Max.   :7.307        Max.   :2308        
##                                                            
##  log_water_risk_fre_3_blk water_risk_sev_3_blk log_water_risk_sev_3_blk
##  Min.   :0.000            Min.   :  0.0        Min.   :0.000           
##  1st Qu.:4.682            1st Qu.:106.0        1st Qu.:4.663           
##  Median :5.037            Median :125.0        Median :4.828           
##  Mean   :5.002            Mean   :127.4        Mean   :4.809           
##  3rd Qu.:5.342            3rd Qu.:147.0        3rd Qu.:4.990           
##  Max.   :7.744            Max.   :313.0        Max.   :5.746           
##                                                                        
##  appl_fail_3_blk  fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk
##  Min.   :-1.000   Min.   :-1.000     Min.   :-1.000   Min.   :-1.00   
##  1st Qu.: 4.000   1st Qu.: 1.000     1st Qu.: 2.000   1st Qu.: 1.00   
##  Median : 5.000   Median : 2.000     Median : 2.000   Median : 4.00   
##  Mean   : 4.106   Mean   : 1.877     Mean   : 1.728   Mean   : 3.06   
##  3rd Qu.: 5.000   3rd Qu.: 3.000     3rd Qu.: 2.000   3rd Qu.: 4.00   
##  Max.   : 5.000   Max.   : 5.000     Max.   : 5.000   Max.   : 5.00   
##                                                                       
##  rep_cost_3_blk   ustructure_fail_3_blk waterh_fail_3_blk
##  Min.   :-1.000   Min.   :-1.000        Min.   :-1.000   
##  1st Qu.: 5.000   1st Qu.: 5.000        1st Qu.: 0.000   
##  Median : 5.000   Median : 5.000        Median : 1.000   
##  Mean   : 4.783   Mean   : 4.428        Mean   : 1.149   
##  3rd Qu.: 5.000   3rd Qu.: 5.000        3rd Qu.: 2.000   
##  Max.   : 5.000   Max.   : 5.000        Max.   : 5.000   
##                                                          
##                     loaddate       customer_cnt_active_policies
##  2020-08-07 08:09:37.497:1995765   Min.   :  1.00              
##                                    1st Qu.:  1.00              
##                                    Median :  1.00              
##                                    Mean   :  1.76              
##                                    3rd Qu.:  1.00              
##                                    Max.   :147.00              
##                                                                
##  customer_cnt_active_policies_binned
##  Min.   :  1.000                    
##  1st Qu.:  1.000                    
##  Median :  1.000                    
##  Mean   :  2.939                    
##  3rd Qu.:  1.000                    
##  Max.   :150.000                    
## 

Number of Claims - Poisson Distribution

We can use goodfit() from the vcd package. H0: The process is a Poisson process.

# Fit a Poisson distribution to the claim-count column by maximum likelihood
# and print the goodness-of-fit test.
gf <- goodfit(
  dataset$cova_ic_nc_water,
  type = "poisson",
  method = "ML"
)
summary(gf)
## 
##   Goodness-of-fit test for poisson distribution
## 
##                      X^2 df      P(> X^2)
## Likelihood Ratio 630.459  2 1.251885e-137

Plot

# Draw the goodfit object from the Poisson fit (observed vs fitted counts).
plot(gf,main="Number of Water Claims data vs Poisson distribution")

P-Value

# Take the p-value from the matrix that summary() returns invisibly instead of
# parsing the fifth line of its printed output, which breaks as soon as the
# print layout, console width or digits option changes.
# capture.output() is used only to silence the repeated printing of the table;
# gf.summary now holds the test matrix (columns "X^2", "df", "P(> X^2)").
invisible(capture.output(gf.summary <- summary(gf)))
pvalue <- unname(gf.summary[1, "P(> X^2)"])
pvalue
## [1] 1.251885e-137

The p-value is less than 0.05, so we reject H0.

It is not a Poisson distribution

Number of Claims - Negative Binomial Distribution

# Repeat the goodness-of-fit check with a negative binomial model (maximum
# likelihood), print the test and plot the fit.
gf <- goodfit(
  dataset$cova_ic_nc_water,
  type = "nbinomial",
  method = "ML"
)
summary(gf)
## 
##   Goodness-of-fit test for nbinomial distribution
## 
##                       X^2 df  P(> X^2)
## Likelihood Ratio 1.767831  1 0.1836508
plot(gf, main = "Number of Water Claims data vs Negative Binomial distribution")

# Take the p-value from the matrix that summary() returns invisibly instead of
# parsing the fifth line of its printed output, which breaks as soon as the
# print layout, console width or digits option changes.
# capture.output() is used only to silence the repeated printing of the table;
# gf.summary now holds the test matrix (columns "X^2", "df", "P(> X^2)").
invisible(capture.output(gf.summary <- summary(gf)))
pvalue <- unname(gf.summary[1, "P(> X^2)"])
pvalue
## [1] 0.1836508

The p-value is larger than 0.05, so we cannot reject H0. The distribution is closer to negative binomial.

Incurred Losses

# Load the claims extract into `data`; the loss sections below derive their
# filtered `dataset` from this copy.
data <- read.csv(
  file = "C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv",
  header = TRUE
)

Positive Losses only

# Keep only the rows whose incurred water loss is at least 100.
has_min_loss <- data$cova_il_nc_water >= 100
dataset <- data[has_min_loss, ]

We also remove the long tail of large losses (values at or above the 95th percentile). This helps the Gamma fit but makes the log of losses look less normal.

# Drop everything at or above the 95th percentile of the remaining losses,
# then draw a 100-bin histogram of what is left.
upper_cut <- quantile(dataset$cova_il_nc_water, 0.95)
dataset <- dataset[dataset$cova_il_nc_water < upper_cut, ]
ggplot(data = dataset, mapping = aes(x = .data[["cova_il_nc_water"]])) +
  geom_histogram(bins = 100) +
  theme_light() +
  xlab("cova_il_nc_water") +
  ylab("Count") +
  ggtitle(paste("Histogram of", "cova_il_nc_water"))

# Kernel density estimate of the trimmed incurred losses.
ggplot(data = dataset, mapping = aes(x = .data[["cova_il_nc_water"]])) +
  geom_density() +
  xlab("cova_il_nc_water") +
  ylab("Density") +
  ggtitle(paste("Density of", "cova_il_nc_water"))

  # Reference line for the normal QQ plot: the straight line through the first
  # and third quartiles of the sample and of the standard normal.
  vec <- dataset$cova_il_nc_water
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y) / diff(x)
  int <- y[1L] - slope * x[1L]

# The colour is a fixed setting, so it is given to the layer. Inside aes(),
# col = 'red' is treated as a one-level data category and the points come out
# in the default discrete-scale colour instead of red. With nothing mapped
# there is no legend, so the legend-hiding theme() call is no longer needed.
ggplot(dataset, aes(sample = .data[["cova_il_nc_water"]])) +
  stat_qq(colour = "red") +
  geom_abline(slope = slope, intercept = int) +
  labs(y = "cova_il_nc_water", title = paste("QQ Plot of", "cova_il_nc_water"))

# 100-bin histogram of the log losses.
# The title is written out literally: paste("Histogram of log ", ...) inserted
# a second space and did not match the "... of log(cova_il_nc_water)" wording
# used by the density and QQ plots of the same variable.
ggplot(dataset, aes(x = .data[["log_cova_il_nc_water"]])) +
  geom_histogram(bins = 100) +
  labs(
    x = "log(cova_il_nc_water)",
    y = "Count",
    title = "Histogram of log(cova_il_nc_water)"
  ) +
  theme_light()

# Kernel density estimate of the log losses.
ggplot(data = dataset, mapping = aes(x = .data[["log_cova_il_nc_water"]])) +
  geom_density() +
  xlab("log(cova_il_nc_water)") +
  ylab("Density") +
  ggtitle("Density of log(cova_il_nc_water)")

  # Reference line for the normal QQ plot: the straight line through the first
  # and third quartiles of the sample and of the standard normal.
  vec <- dataset$log_cova_il_nc_water
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y) / diff(x)
  int <- y[1L] - slope * x[1L]

# The colour is a fixed setting, so it is given to the layer. Inside aes(),
# col = 'red' is treated as a one-level data category and the points come out
# in the default discrete-scale colour instead of red. With nothing mapped
# there is no legend, so the legend-hiding theme() call is no longer needed.
ggplot(dataset, aes(sample = .data[["log_cova_il_nc_water"]])) +
  stat_qq(colour = "red") +
  geom_abline(slope = slope, intercept = int) +
  labs(y = "log(cova_il_nc_water)", title = "QQ Plot of log(cova_il_nc_water)")

Incurred Losses - Gamma Distribution

https://stackoverflow.com/questions/45536234/how-would-you-fit-a-gamma-distribution-to-a-data-in-r https://www.r-bloggers.com/goodness-of-fit-test-in-r/

# Index plot of the trimmed losses (pch = 20 draws small filled dots).
plot(dataset$cova_il_nc_water, pch=20)

# fitdistrplus overview of the empirical distribution, with histo and demp
# both switched on.
plotdist(dataset$cova_il_nc_water, histo = TRUE, demp = TRUE)

# Descriptive statistics for a continuous variable; boot = 500 requests
# bootstrap replicates (presumably for the skewness-kurtosis plot — confirm).
descdist(dataset$cova_il_nc_water, discrete=FALSE, boot=500)

## summary statistics
## ------
## min:  101.16   max:  46478.7 
## median:  6592.555 
## mean:  10060.89 
## estimated sd:  9921.511 
## estimated skewness:  1.491259 
## estimated kurtosis:  4.749601

Scaling is needed to prevent the error: "Error in fitdist(dataset$cova_il_nc_water/10, "gamma", method = "mle") : the function mle failed to estimate the parameters, with the error code 100".

https://stackoverflow.com/questions/53557022/error-code-100-fitting-exp-distribution-using-fitdist-in-r

It looks like some numerical stability problem with the underlying algorithm. It hits something indistinguishable from infinity.

# Fit Weibull, gamma and lognormal candidates by maximum likelihood.
# All three are fitted to the losses divided by 100.
losses_in_hundreds <- dataset$cova_il_nc_water / 100
fit_w <- fitdist(losses_in_hundreds, distr = "weibull", method = "mle")
fit_g <- fitdist(losses_in_hundreds, distr = "gamma", method = "mle")
fit_ln <- fitdist(losses_in_hundreds, distr = "lnorm", method = "mle")
summary(fit_w)
## Fitting of the distribution ' weibull ' by maximum likelihood 
## Parameters : 
##         estimate  Std. Error
## shape   1.013154 0.007503436
## scale 101.189548 1.004411260
## Loglikelihood:  -61834.32   AIC:  123672.6   BIC:  123687.2 
## Correlation matrix:
##           shape     scale
## shape 1.0000000 0.3200784
## scale 0.3200784 1.0000000
summary(fit_g)
## Fitting of the distribution ' gamma ' by maximum likelihood 
## Parameters : 
##         estimate   Std. Error
## shape 1.04059128 0.0122611408
## rate  0.01034088 0.0001534012
## Loglikelihood:  -61830.34   AIC:  123664.7   BIC:  123679.3 
## Correlation matrix:
##           shape      rate
## shape 1.0000000 0.7817832
## rate  0.7817832 1.0000000
summary(fit_ln)
## Fitting of the distribution ' lnorm ' by maximum likelihood 
## Parameters : 
##         estimate  Std. Error
## meanlog 4.059114 0.011257903
## sdlog   1.181812 0.007960514
## Loglikelihood:  -62209.01   AIC:  124422   BIC:  124436.6 
## Correlation matrix:
##         meanlog sdlog
## meanlog       1     0
## sdlog         0     1
# Overlay the three fitted candidates on the data in density, CDF, Q-Q and
# P-P form; the legend labels follow the order of the fits in the list.
plot.legend <- c("Weibull", "gamma", "lognormal")
denscomp(list(fit_w, fit_g, fit_ln), legendtext = plot.legend)

cdfcomp(list(fit_w, fit_g, fit_ln), legendtext = plot.legend)

qqcomp(list(fit_w, fit_g, fit_ln), legendtext = plot.legend)

ppcomp(list(fit_w, fit_g, fit_ln), legendtext = plot.legend)

Gamma and Weibull are close to each other. None of the candidates fits the observed distribution well; the best of a poor set is Gamma.

Kolmogorov-Smirnov does not work

# Method-of-moments gamma parameters from the sample mean (m) and variance (v):
# shape = m^2 / v and scale = v / m. The last print shows 1 / scale.
m <- mean(dataset$cova_il_nc_water)
v <- var(dataset$cova_il_nc_water)

print(m)
## [1] 10060.89
print(v)
## [1] 98436385
shape <- m * m / v
scale <- v / m

print(shape)
## [1] 1.028293
print(1 / scale)
## [1] 0.000102207

Shape = 1.028293, Scale = 1 / 0.000102207 ≈ 9784.06 (values from the output printed above)

Kolmogorov-Smirnov is a simple nonparametric test for one-dimensional probability distributions. Like the Cramér–von Mises test, it compares the empirical distribution with a reference distribution.

# Draw a reference sample from the moment-matched gamma distribution and
# compare it with the observed losses using a two-sample KS test.
# `scale` already holds the gamma scale (v / m), so it is passed unchanged.
# The previous `scale = 1/scale` put the rate in the scale slot, so the
# simulated values were orders of magnitude smaller than the losses and the
# two samples did not overlap at all (D = 1).
# NOTE(review): the output recorded below this chunk was produced by the old
# call and needs to be regenerated.
num_of_samples <- 10000
y <- rgamma(num_of_samples, shape = shape, scale = scale)
ks.test(dataset$cova_il_nc_water, y)
## Warning in ks.test(dataset$cova_il_nc_water, y): p-value will be
## approximate in the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  dataset$cova_il_nc_water and y
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided

Log of Incurred Losses - Normal Distribution

# Rebuild `dataset` from the full data, keeping losses strictly above 100 and
# this time leaving the upper tail in, then draw a 100-bin histogram.
above_threshold <- data$cova_il_nc_water > 100
dataset <- data[above_threshold, ]
ggplot(data = dataset, mapping = aes(x = .data[["cova_il_nc_water"]])) +
  geom_histogram(bins = 100) +
  theme_light() +
  xlab("cova_il_nc_water") +
  ylab("Count") +
  ggtitle(paste("Histogram of", "cova_il_nc_water"))

# Kernel density estimate of the incurred losses.
ggplot(data = dataset, mapping = aes(x = .data[["cova_il_nc_water"]])) +
  geom_density() +
  xlab("cova_il_nc_water") +
  ylab("Density") +
  ggtitle(paste("Density of", "cova_il_nc_water"))

  # Reference line for the normal QQ plot: the straight line through the first
  # and third quartiles of the sample and of the standard normal.
  vec <- dataset$cova_il_nc_water
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y) / diff(x)
  int <- y[1L] - slope * x[1L]

# The colour is a fixed setting, so it is given to the layer. Inside aes(),
# col = 'red' is treated as a one-level data category and the points come out
# in the default discrete-scale colour instead of red. With nothing mapped
# there is no legend, so the legend-hiding theme() call is no longer needed.
ggplot(dataset, aes(sample = .data[["cova_il_nc_water"]])) +
  stat_qq(colour = "red") +
  geom_abline(slope = slope, intercept = int) +
  labs(y = "cova_il_nc_water", title = paste("QQ Plot of", "cova_il_nc_water"))

# 100-bin histogram of the log losses.
# The title is written out literally: paste("Histogram of log ", ...) inserted
# a second space and did not match the "... of log(cova_il_nc_water)" wording
# used by the density and QQ plots of the same variable.
ggplot(dataset, aes(x = .data[["log_cova_il_nc_water"]])) +
  geom_histogram(bins = 100) +
  labs(
    x = "log(cova_il_nc_water)",
    y = "Count",
    title = "Histogram of log(cova_il_nc_water)"
  ) +
  theme_light()

# Kernel density estimate of the log losses.
ggplot(data = dataset, mapping = aes(x = .data[["log_cova_il_nc_water"]])) +
  geom_density() +
  xlab("log(cova_il_nc_water)") +
  ylab("Density") +
  ggtitle("Density of log(cova_il_nc_water)")

  # Reference line for the normal QQ plot: the straight line through the first
  # and third quartiles of the sample and of the standard normal.
  vec <- dataset$log_cova_il_nc_water
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y) / diff(x)
  int <- y[1L] - slope * x[1L]

# The colour is a fixed setting, so it is given to the layer. Inside aes(),
# col = 'red' is treated as a one-level data category and the points come out
# in the default discrete-scale colour instead of red. With nothing mapped
# there is no legend, so the legend-hiding theme() call is no longer needed.
ggplot(dataset, aes(sample = .data[["log_cova_il_nc_water"]])) +
  stat_qq(colour = "red") +
  geom_abline(slope = slope, intercept = int) +
  labs(y = "log(cova_il_nc_water)", title = "QQ Plot of log(cova_il_nc_water)")

Shapiro test

# Shapiro-Wilk normality test on 5000 randomly chosen rows of the log losses
# (presumably subsampled because shapiro.test() limits the sample size —
# confirm against the stats documentation).
# NOTE(review): no set.seed() is visible near this call, so the drawn rows and
# therefore W and the p-value may differ from run to run.
shapiro.test(dataset[sample(nrow(dataset), 5000), 'log_cova_il_nc_water'])
## 
##  Shapiro-Wilk normality test
## 
## data:  dataset[sample(nrow(dataset), 5000), "log_cova_il_nc_water"]
## W = 0.99235, p-value = 9.133e-16

The p-value (9.133e-16) is far below 0.05, implying that the distribution of the data is significantly different from a normal distribution. In other words, we can NOT assume normality.

# Index plot of the log losses (pch = 20 draws small filled dots).
plot(dataset$log_cova_il_nc_water, pch=20)

# fitdistrplus overview of the empirical distribution, with histo and demp
# both switched on.
plotdist(dataset$log_cova_il_nc_water, histo = TRUE, demp = TRUE)

# Descriptive statistics for a continuous variable; boot = 500 requests
# bootstrap replicates (presumably for the skewness-kurtosis plot — confirm).
descdist(dataset$log_cova_il_nc_water, discrete=FALSE, boot=500)

## summary statistics
## ------
## min:  4.616703   max:  13.16683 
## median:  8.869405 
## mean:  8.791553 
## estimated sd:  1.281621 
## estimated skewness:  -0.3127817 
## estimated kurtosis:  3.078105
# Fit a normal distribution to the log losses by maximum likelihood, print the
# fit, then compare it with the data in density and CDF form.
fit_n <- fitdist(
  dataset$log_cova_il_nc_water,
  distr = "norm",
  method = "mle"
)
summary(fit_n)
## Fitting of the distribution ' norm ' by maximum likelihood 
## Parameters : 
##      estimate  Std. Error
## mean 8.791553 0.011898529
## sd   1.281566 0.008413507
## Loglikelihood:  -19339.11   AIC:  38682.23   BIC:  38696.95 
## Correlation matrix:
##      mean sd
## mean    1  0
## sd      0  1
plot.legend <- "Normal"
denscomp(list(fit_n), legendtext = plot.legend)

cdfcomp(list(fit_n), legendtext = plot.legend)