# Read the property water-claims CSV into a data frame; the first row of the
# file supplies the column names.
dataset <- read.csv(
  file = "C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv",
  header = TRUE
)
library(vcd)
## Loading required package: grid
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(fitdistrplus)
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.5.3
## Loading required package: survival
# List every column name in the loaded data frame.
names(dataset)
## [1] "modeldata_id"
## [2] "systemidstart"
## [3] "systemidend"
## [4] "cal_year"
## [5] "startdate"
## [6] "enddate"
## [7] "startdatetm"
## [8] "enddatetm"
## [9] "ecy"
## [10] "log_ecy"
## [11] "policynumber"
## [12] "policy_uniqueid"
## [13] "policyterm"
## [14] "policytype"
## [15] "effectivedate"
## [16] "expirationdate"
## [17] "policystate"
## [18] "policyform"
## [19] "persistency"
## [20] "companycd"
## [21] "carriercd"
## [22] "agency_group"
## [23] "producername"
## [24] "territory"
## [25] "risknumber"
## [26] "risktype"
## [27] "yearbuilt"
## [28] "log_yearbuilt"
## [29] "sqft"
## [30] "log_sqft"
## [31] "stories"
## [32] "roofcd"
## [33] "roofcd_encd"
## [34] "units"
## [35] "occupancycd"
## [36] "occupancy_encd"
## [37] "allperilded"
## [38] "waterded"
## [39] "protectionclass"
## [40] "constructioncd"
## [41] "constructioncd_encd"
## [42] "fire_risk_model_score"
## [43] "multipolicyind"
## [44] "multipolicyindumbrella"
## [45] "earthquakeumbrellaind"
## [46] "usagetype"
## [47] "usagetype_encd"
## [48] "ordinanceorlawpct"
## [49] "functionalreplacementcost"
## [50] "homegardcreditind"
## [51] "sprinklersystem"
## [52] "landlordind"
## [53] "rentersinsurance"
## [54] "firealarmtype"
## [55] "burglaryalarmtype"
## [56] "waterdetectiondevice"
## [57] "neighborhoodcrimewatchind"
## [58] "propertymanager"
## [59] "safeguardplusind"
## [60] "kitchenfireextinguisherind"
## [61] "gatedcommunityind"
## [62] "deadboltind"
## [63] "poolind"
## [64] "replacementcostdwellingind"
## [65] "replacementvalueind"
## [66] "serviceline"
## [67] "equipmentbreakdown"
## [68] "numberoffamilies"
## [69] "insuredage"
## [70] "maritalstatus"
## [71] "insurancescore"
## [72] "overriddeninsurancescore"
## [73] "insurancescorevalue"
## [74] "insscoretiervalueband"
## [75] "financialstabilitytier"
## [76] "allcov_wp"
## [77] "cova_wp"
## [78] "cova_ep"
## [79] "cova_deductible"
## [80] "log_cova_deductible"
## [81] "cova_limit"
## [82] "log_cova_limit"
## [83] "cova_ic_nc_water"
## [84] "hasclaim"
## [85] "cova_il_nc_water"
## [86] "log_cova_il_nc_water"
## [87] "water_risk_3_blk"
## [88] "log_water_risk_3_blk"
## [89] "water_risk_fre_3_blk"
## [90] "log_water_risk_fre_3_blk"
## [91] "water_risk_sev_3_blk"
## [92] "log_water_risk_sev_3_blk"
## [93] "appl_fail_3_blk"
## [94] "fixture_leak_3_blk"
## [95] "pipe_froze_3_blk"
## [96] "plumb_leak_3_blk"
## [97] "rep_cost_3_blk"
## [98] "ustructure_fail_3_blk"
## [99] "waterh_fail_3_blk"
## [100] "loaddate"
## [101] "customer_cnt_active_policies"
## [102] "customer_cnt_active_policies_binned"
# Compact structural overview: storage type and leading values of each column.
str(dataset)
## 'data.frame': 1995765 obs. of 102 variables:
## $ modeldata_id : int 534254 1793227 880091 1653118 353309 1289956 305141 1115709 2075469 1924235 ...
## $ systemidstart : int 1219556 7403270 2370713 6514641 733342 4348409 544662 3436984 8943949 8072132 ...
## $ systemidend : int 1219556 7403270 2955483 6514641 733342 4348409 544662 3436984 8943949 8072132 ...
## $ cal_year : int 2013 2018 2014 2018 2011 2016 2012 2015 2020 2020 ...
## $ startdate : Factor w/ 4290 levels "2009-01-01","2009-01-02",..: 1462 3611 2067 3424 1030 2878 1096 2544 4093 4018 ...
## $ enddate : Factor w/ 4620 levels "2009-01-09","2009-01-12",..: 1553 3620 2159 3431 1044 2890 1225 2524 4351 4156 ...
## $ startdatetm : Factor w/ 23752 levels "2008-01-09 00:00:00",..: 6832 20120 11523 19086 5616 15738 4639 13893 22823 21287 ...
## $ enddatetm : Factor w/ 16484 levels "2009-01-09 00:00:00",..: 5396 14951 8350 12631 3384 11614 4105 10088 16289 15885 ...
## $ ecy : num 0.339 0.115 0.342 0.11 0.129 ...
## $ log_ecy : num -1.08 -2.16 -1.07 -2.21 -2.05 ...
## $ policynumber : Factor w/ 247043 levels "AZF0082147","AZF0221975",..: 33600 194123 187155 140475 2424 15381 185108 164391 100140 201424 ...
## $ policy_uniqueid : int 445056 1541369 786905 1427784 348649 1124000 292060 976554 1776548 1654555 ...
## $ policyterm : int 4 7 4 10 1 3 1 7 4 6 ...
## $ policytype : Factor w/ 2 levels "New","Renewal": 2 2 2 2 1 2 1 2 2 2 ...
## $ effectivedate : Factor w/ 4331 levels "2008-01-09","2008-01-14",..: 1264 3654 2110 3467 1073 2921 935 2587 4136 3866 ...
## $ expirationdate : Factor w/ 4451 levels "2009-01-09","2009-01-14",..: 1385 3775 2231 3588 1195 3042 1057 2709 4256 3987 ...
## $ policystate : Factor w/ 3 levels "AZ","CA","NV": 2 2 2 2 1 1 2 2 2 2 ...
## $ policyform : Factor w/ 9 levels "DF1","DF3","DF6",..: 2 9 9 9 2 9 9 9 2 9 ...
## $ persistency : int 3 6 3 25 0 2 0 7 3 5 ...
## $ companycd : int 1 17 17 1 1 16 17 17 17 1 ...
## $ carriercd : Factor w/ 2 levels "CSEICO","CSESG": 1 2 2 1 1 1 2 2 2 1 ...
## $ agency_group : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 446 582 89 50 265 265 378 426 426 426 ...
## $ producername : Factor w/ 1271 levels "1ST CENTURY INS SVCS INC.",..: 983 1226 142 78 452 452 739 889 948 948 ...
## $ territory : Factor w/ 10 levels "","AZ-A","AZ-T",..: 8 6 6 4 6 6 8 7 7 7 ...
## $ risknumber : int 1 1 1 1 1 1 1 1 1 1 ...
## $ risktype : Factor w/ 2 levels "Dwelling","Homeowners": 1 2 2 2 1 2 2 2 1 2 ...
## $ yearbuilt : int 1986 1986 1994 1951 1974 1997 1990 1958 2005 1984 ...
## $ log_yearbuilt : num 7.59 7.59 7.6 7.58 7.59 ...
## $ sqft : int 1500 2200 2600 1100 1200 1400 1700 1100 2700 1300 ...
## $ log_sqft : num 7.33 7.73 7.87 7.04 7.12 ...
## $ stories : int 1 2 2 1 1 1 1 1 2 2 ...
## $ roofcd : Factor w/ 7 levels "COMPO","MEMBRANE",..: 6 6 6 4 1 1 6 1 6 1 ...
## $ roofcd_encd : int 7 7 7 6 8 8 7 8 7 8 ...
## $ units : int 1 1 1 1 1 1 1 1 1 1 ...
## $ occupancycd : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ occupancy_encd : int 1 1 1 1 1 1 1 1 1 1 ...
## $ allperilded : int 500 1000 2500 1000 1000 1000 1000 1000 2500 2500 ...
## $ waterded : int 0 0 0 0 0 0 0 0 0 0 ...
## $ protectionclass : int 3 2 3 3 2 2 4 3 4 4 ...
## $ constructioncd : Factor w/ 5 levels "AF","B","F","M",..: 3 1 1 3 2 5 3 3 1 1 ...
## $ constructioncd_encd : int 5 4 4 5 3 2 5 5 4 4 ...
## $ fire_risk_model_score : int 0 2 0 0 -1 -1 2 0 1 2 ...
## $ multipolicyind : int 0 0 0 1 0 0 1 0 0 0 ...
## $ multipolicyindumbrella : int 0 0 0 0 0 0 0 0 0 0 ...
## $ earthquakeumbrellaind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ usagetype : Factor w/ 7 levels "COC","PRIMARY",..: 3 2 2 2 3 2 2 2 3 2 ...
## $ usagetype_encd : int 6 7 7 7 6 7 7 7 6 7 ...
## $ ordinanceorlawpct : int 10 20 0 10 0 0 0 0 10 10 ...
## $ functionalreplacementcost : int 0 0 0 0 0 0 0 0 0 0 ...
## $ homegardcreditind : int 0 0 0 0 0 0 0 0 0 1 ...
## $ sprinklersystem : int 0 0 1 0 0 0 0 0 0 0 ...
## $ landlordind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rentersinsurance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ firealarmtype : int 0 0 1 1 1 1 1 0 1 1 ...
## $ burglaryalarmtype : int 0 0 0 0 0 0 0 0 1 0 ...
## $ waterdetectiondevice : int 0 0 0 0 0 0 0 0 0 0 ...
## $ neighborhoodcrimewatchind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ propertymanager : int 0 0 0 0 0 0 0 0 0 0 ...
## $ safeguardplusind : int 0 1 0 0 0 0 0 1 1 0 ...
## $ kitchenfireextinguisherind : int 1 1 1 0 0 0 0 1 0 0 ...
## $ gatedcommunityind : int 0 0 0 0 0 0 0 1 0 0 ...
## $ deadboltind : int 1 1 1 0 1 1 1 1 1 1 ...
## $ poolind : int 0 1 0 0 0 0 0 0 0 0 ...
## $ replacementcostdwellingind : int 0 0 0 0 1 0 0 0 1 0 ...
## $ replacementvalueind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ serviceline : int 0 1 0 0 0 0 0 0 0 0 ...
## $ equipmentbreakdown : int 0 1 0 0 0 0 0 0 0 0 ...
## $ numberoffamilies : int 1 1 1 1 1 1 1 1 1 1 ...
## $ insuredage : int 66 61 39 92 79 36 72 59 55 69 ...
## $ maritalstatus : Factor w/ 5 levels "~","Divorced",..: 1 3 3 1 4 4 4 1 3 3 ...
## $ insurancescore : Factor w/ 3850 levels "(DOES","~","610",..: 2 2 2 2 2 3817 2 2 2 2 ...
## $ overriddeninsurancescore : Factor w/ 41 levels "~","01","02",..: 1 1 1 1 1 13 1 1 1 1 ...
## $ insurancescorevalue : Factor w/ 262 levels "~","568","602",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ insscoretiervalueband : Factor w/ 29 levels "~","559-599",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ financialstabilitytier : Factor w/ 42 levels "","~","01","02",..: 1 2 1 1 1 1 1 1 2 2 ...
## $ allcov_wp : num 1029 1938 915 915 336 ...
## $ cova_wp : num 1029 1342 1732 1721 282 ...
## $ cova_ep : num 349.2 154.2 592.7 188.4 36.3 ...
## $ cova_deductible : int 500 1000 2500 1000 1000 1000 1000 1000 2500 2500 ...
## $ log_cova_deductible : num 6.21 6.91 7.82 6.91 6.91 ...
## $ cova_limit : int 600000 600000 600000 400000 200000 200000 300000 300000 700000 400000 ...
## $ log_cova_limit : num 13.2 13.3 13.2 12.7 11.7 ...
## $ cova_ic_nc_water : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hasclaim : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cova_il_nc_water : num 0 0 0 0 0 0 0 0 0 0 ...
## $ log_cova_il_nc_water : num 0 0 0 0 0 0 0 0 0 0 ...
## $ water_risk_3_blk : int 230 329 213 94 234 183 329 128 171 237 ...
## $ log_water_risk_3_blk : num 5.44 5.8 5.36 4.54 5.46 ...
## $ water_risk_fre_3_blk : int 152 218 134 53 256 226 218 83 159 180 ...
## $ log_water_risk_fre_3_blk : num 5.02 5.38 4.9 3.97 5.55 ...
## $ water_risk_sev_3_blk : int 156 155 163 182 94 83 155 159 110 136 ...
## $ log_water_risk_sev_3_blk : num 5.05 5.04 5.09 5.2 4.54 ...
## $ appl_fail_3_blk : int 5 4 1 5 5 5 5 2 5 5 ...
## $ fixture_leak_3_blk : int 4 2 1 0 3 5 2 2 0 1 ...
## $ pipe_froze_3_blk : int 0 0 0 2 0 0 2 2 2 2 ...
## $ plumb_leak_3_blk : int 5 5 5 1 5 4 4 4 1 4 ...
## $ rep_cost_3_blk : int 5 5 5 5 5 4 5 5 5 5 ...
## $ ustructure_fail_3_blk : int 5 5 5 3 5 5 5 5 1 5 ...
## $ waterh_fail_3_blk : int 2 2 2 0 0 1 2 0 2 1 ...
## [list output truncated]
# Per-column summary: quantiles and mean for numeric columns, level counts for
# factor columns, plus NA counts where present.
summary(dataset)
## modeldata_id systemidstart systemidend cal_year
## Min. : 1 Min. : 2 Min. : 3 Min. :2009
## 1st Qu.: 546307 1st Qu.:1254638 1st Qu.:1353054 1st Qu.:2013
## Median :1095214 Median :3292474 Median :3445261 Median :2016
## Mean :1091473 Mean :3892680 Mean :3993420 Mean :2015
## 3rd Qu.:1637953 3rd Qu.:6415028 3rd Qu.:6591991 3rd Qu.:2018
## Max. :2178777 Max. :9657386 Max. :9657502 Max. :2021
##
## startdate enddate startdatetm
## 2019-01-01: 113586 2019-01-01: 113557 2018-06-01 00:00:00: 1707
## 2018-01-01: 108834 2018-01-01: 108763 2017-09-01 00:00:00: 1557
## 2020-01-01: 96907 2020-01-01: 96974 2018-07-01 00:00:00: 1547
## 2017-01-01: 88488 2017-01-01: 88375 2017-07-01 00:00:00: 1508
## 2016-01-01: 76458 2016-01-01: 76444 2018-08-01 00:00:00: 1454
## 2013-01-01: 74669 2013-01-01: 74678 2018-09-01 00:00:00: 1435
## (Other) :1436823 (Other) :1436974 (Other) :1986557
## enddatetm ecy log_ecy
## 2019-06-01 00:00:00: 2013 Min. :0.0027 Min. :-5.914504
## 2018-06-01 00:00:00: 1893 1st Qu.:0.2464 1st Qu.:-1.400799
## 2019-07-01 00:00:00: 1887 Median :0.4709 Median :-0.753110
## 2019-08-01 00:00:00: 1842 Mean :0.4793 Mean :-1.021357
## 2018-07-01 00:00:00: 1829 3rd Qu.:0.7091 3rd Qu.:-0.343759
## 2018-09-01 00:00:00: 1802 Max. :1.0020 Max. : 0.001998
## (Other) :1984499
## policynumber policy_uniqueid policyterm policytype
## CAF0389924: 201 Min. : 1 Min. : 1.000 New : 478819
## CAF0461789: 192 1st Qu.: 494781 1st Qu.: 2.000 Renewal:1516946
## CAF0475516: 168 Median : 959846 Median : 3.000
## CAF0464778: 144 Mean : 946658 Mean : 3.792
## CAF0393082: 130 3rd Qu.:1411214 3rd Qu.: 5.000
## CAF0468201: 120 Max. :1854593 Max. :13.000
## (Other) :1994810
## effectivedate expirationdate policystate
## 2018-06-01: 2058 2019-06-01: 2058 AZ: 149213
## 2017-07-01: 1945 2018-07-01: 1943 CA:1779623
## 2017-09-01: 1923 2018-09-01: 1923 NV: 66929
## 2018-07-01: 1914 2019-07-01: 1913
## 2017-06-01: 1901 2018-06-01: 1904
## 2018-08-01: 1898 2019-08-01: 1892
## (Other) :1984126 (Other) :1984132
## policyform persistency companycd carriercd
## HO3 :1013886 Min. : 0.000 Min. : 1.00 CSEICO: 688867
## DF3 : 815992 1st Qu.: 1.000 1st Qu.: 1.00 CSESG :1306898
## DF6 : 84229 Median : 3.000 Median :17.00
## Form3 : 39641 Mean : 6.068 Mean :12.09
## FL1-Vacant : 22667 3rd Qu.: 8.000 3rd Qu.:17.00
## FL3-Special: 10316 Max. :103.000 Max. :19.00
## (Other) : 9034
## agency_group
## WESTERN GOLD INS AGCY INC. : 236520
## J.E. BROWN and ASSOCS INS SVCS : 87449
## PIIB - PACIFIC INTERSTATE INS : 63855
## CRUSBERG DECKER INS SVCS INC : 61767
## ISU INSURANCE SERVICES OF SAN FRANCISCO INC: 53631
## Acrisure of California : 47846
## (Other) :1444697
## producername territory
## WESTERN GOLD INS AGCY INC. : 234904 CA-B :604119
## J.E. BROWN and ASSOCS INS SVCS: 82990 CA-C :410766
## CRUSBERG DECKER INS SVCS INC : 40912 CA-O :350944
## TOMM and BUCK INSURANCE SRVCS : 23611 CA-A :302099
## BICHLMEIER INSURANCE SRVS INC : 23410 AZ-A :131863
## VALLEY WEST FINANCIAL INS SVC : 22406 CA-T :125226
## (Other) :1567532 (Other): 70748
## risknumber risktype yearbuilt log_yearbuilt
## Min. : 0.000 Dwelling : 942238 Min. :1900 Min. :7.523
## 1st Qu.: 1.000 Homeowners:1053527 1st Qu.:1959 1st Qu.:7.580
## Median : 1.000 Median :1980 Median :7.591
## Mean : 1.006 Mean :1976 Mean :7.589
## 3rd Qu.: 1.000 3rd Qu.:1996 3rd Qu.:7.599
## Max. :16.000 Max. :2019 Max. :7.611
##
## sqft log_sqft stories roofcd
## Min. : 800 Min. :6.397 Min. :1.000 COMPO :1000077
## 1st Qu.:1300 1st Qu.:7.178 1st Qu.:1.000 MEMBRANE: 23443
## Median :1700 Median :7.439 Median :1.000 METAL : 5141
## Mean :1874 Mean :7.471 Mean :1.208 OTHER : 234652
## 3rd Qu.:2300 3rd Qu.:7.750 3rd Qu.:1.000 TAR : 45170
## Max. :5000 Max. :9.210 Max. :3.000 TILE : 669164
## WOOD : 18118
## roofcd_encd units occupancycd occupancy_encd
## Min. :1.00 Min. :1.000 NO : 5 Min. :1.000
## 1st Qu.:7.00 1st Qu.:1.000 OCCUPIEDNOW:1861796 1st Qu.:1.000
## Median :8.00 Median :1.000 TENANT : 133964 Median :1.000
## Mean :7.23 Mean :1.125 Mean :1.067
## 3rd Qu.:8.00 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :8.00 Max. :4.000 Max. :3.000
##
## allperilded waterded protectionclass constructioncd
## Min. : 0 Min. : 0.00 Min. : 0.000 AF : 805663
## 1st Qu.: 1000 1st Qu.: 0.00 1st Qu.: 2.000 B : 22576
## Median : 1000 Median : 0.00 Median : 3.000 F :1121787
## Mean : 1405 Mean : 95.49 Mean : 2.945 M : 22679
## 3rd Qu.: 2500 3rd Qu.: 0.00 3rd Qu.: 4.000 OTHER: 23060
## Max. :10000 Max. :10000.00 Max. :10.000
##
## constructioncd_encd fire_risk_model_score multipolicyind
## Min. :1.000 Min. :-1.0000 Min. :0.0000
## 1st Qu.:4.000 1st Qu.: 0.0000 1st Qu.:0.0000
## Median :5.000 Median : 0.0000 Median :0.0000
## Mean :4.494 Mean : 0.2133 Mean :0.1618
## 3rd Qu.:5.000 3rd Qu.: 0.0000 3rd Qu.:0.0000
## Max. :5.000 Max. :18.0000 Max. :1.0000
##
## multipolicyindumbrella earthquakeumbrellaind usagetype
## Min. :0.000000 Min. :0.000000 COC : 11506
## 1st Qu.:0.000000 1st Qu.:0.000000 PRIMARY :1110389
## Median :0.000000 Median :0.000000 RENTAL : 850681
## Mean :0.005219 Mean :0.004454 SEASONAL : 8401
## 3rd Qu.:0.000000 3rd Qu.:0.000000 SECONDARY : 3618
## Max. :1.000000 Max. :1.000000 UNOCCUPIED: 321
## VACANT : 10849
## usagetype_encd ordinanceorlawpct functionalreplacementcost
## Min. :1.000 Min. : 0.000 Min. :0.000000
## 1st Qu.:6.000 1st Qu.: 0.000 1st Qu.:0.000000
## Median :7.000 Median : 10.000 Median :0.000000
## Mean :6.519 Mean : 9.894 Mean :0.001919
## 3rd Qu.:7.000 3rd Qu.: 10.000 3rd Qu.:0.000000
## Max. :7.000 Max. :100.000 Max. :1.000000
##
## homegardcreditind sprinklersystem landlordind rentersinsurance
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.000000
## Mean :0.1413 Mean :0.0321 Mean :0.08072 Mean :0.005927
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.000000
##
## firealarmtype burglaryalarmtype waterdetectiondevice
## Min. :0.0000 Min. :0.000 Min. :0.0000000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000000
## Median :1.0000 Median :0.000 Median :0.0000000
## Mean :0.6268 Mean :0.366 Mean :0.0002465
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.0000000
## Max. :1.0000 Max. :1.000 Max. :1.0000000
##
## neighborhoodcrimewatchind propertymanager safeguardplusind
## Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.01384 Mean :0.0209 Mean :0.3696
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## kitchenfireextinguisherind gatedcommunityind deadboltind
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :1.0000
## Mean :0.3976 Mean :0.01443 Mean :0.7204
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## poolind replacementcostdwellingind replacementvalueind
## Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.03582 Mean :0.3577 Mean :0.01676
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.00000
##
## serviceline equipmentbreakdown numberoffamilies insuredage
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.: 46.0
## Median :0.0000 Median :0.0000 Median :1.000 Median : 57.0
## Mean :0.1178 Mean :0.1202 Mean :1.126 Mean : 56.4
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.: 66.0
## Max. :1.0000 Max. :1.0000 Max. :4.000 Max. :177.0
## NA's :236486
## maritalstatus insurancescore overriddeninsurancescore
## ~ :794711 ~ :1906604 ~ :1849724
## Divorced: 8153 99 : 418 99 : 50147
## Married :566894 KQXKD : 155 07 : 9717
## Single :606747 ZKKWS : 119 12 : 8082
## Widowed : 19260 KFWZV : 108 04 : 6351
## XXQKS : 101 06 : 5877
## (Other): 88260 (Other): 65867
## insurancescorevalue insscoretiervalueband financialstabilitytier
## ~ :1994253 ~ :1994253 :1740149
## 825 : 40 837-865: 168 ~ : 238805
## 829 : 21 748-774: 147 07 : 1630
## 881 : 21 865-880: 141 12 : 1394
## 873 : 20 714-731: 129 04 : 1076
## 868 : 19 820-837: 129 13 : 1010
## (Other): 1391 (Other): 798 (Other): 11701
## allcov_wp cova_wp cova_ep cova_deductible
## Min. : 32.0 Min. : 5.0 Min. :-8666.2 Min. : 0
## 1st Qu.: 549.0 1st Qu.: 468.0 1st Qu.: 115.6 1st Qu.: 1000
## Median : 778.0 Median : 755.0 Median : 282.6 Median : 1000
## Mean : 911.3 Mean : 973.1 Mean : 409.9 Mean : 1405
## 3rd Qu.: 1105.0 3rd Qu.: 1230.0 3rd Qu.: 566.5 3rd Qu.: 2500
## Max. :18926.0 Max. :14620.0 Max. :12121.9 Max. :10000
##
## log_cova_deductible cova_limit log_cova_limit cova_ic_nc_water
## Min. :0.000 Min. : 100000 Min. : 8.294 Min. :0.000000
## 1st Qu.:6.908 1st Qu.: 300000 1st Qu.:12.385 1st Qu.:0.000000
## Median :6.908 Median : 400000 Median :12.702 Median :0.000000
## Mean :7.019 Mean : 422331 Mean :12.689 Mean :0.005966
## 3rd Qu.:7.824 3rd Qu.: 500000 3rd Qu.:13.036 3rd Qu.:0.000000
## Max. :9.210 Max. :1300000 Max. :14.944 Max. :3.000000
##
## hasclaim cova_il_nc_water log_cova_il_nc_water
## Min. :0.000000 Min. : -5536.9 Min. :-0.10536
## 1st Qu.:0.000000 1st Qu.: 0.0 1st Qu.: 0.00000
## Median :0.000000 Median : 0.0 Median : 0.00000
## Mean :0.005833 Mean : 79.4 Mean : 0.05118
## 3rd Qu.:0.000000 3rd Qu.: 0.0 3rd Qu.: 0.00000
## Max. :1.000000 Max. :522735.2 Max. :13.16683
## NA's :3
## water_risk_3_blk log_water_risk_3_blk water_risk_fre_3_blk
## Min. : 0.0 Min. :0.000 Min. : 0
## 1st Qu.: 134.0 1st Qu.:4.898 1st Qu.: 108
## Median : 185.0 Median :5.220 Median : 154
## Mean : 202.1 Mean :5.189 Mean : 169
## 3rd Qu.: 244.0 3rd Qu.:5.497 3rd Qu.: 209
## Max. :1491.0 Max. :7.307 Max. :2308
##
## log_water_risk_fre_3_blk water_risk_sev_3_blk log_water_risk_sev_3_blk
## Min. :0.000 Min. : 0.0 Min. :0.000
## 1st Qu.:4.682 1st Qu.:106.0 1st Qu.:4.663
## Median :5.037 Median :125.0 Median :4.828
## Mean :5.002 Mean :127.4 Mean :4.809
## 3rd Qu.:5.342 3rd Qu.:147.0 3rd Qu.:4.990
## Max. :7.744 Max. :313.0 Max. :5.746
##
## appl_fail_3_blk fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk
## Min. :-1.000 Min. :-1.000 Min. :-1.000 Min. :-1.00
## 1st Qu.: 4.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.00
## Median : 5.000 Median : 2.000 Median : 2.000 Median : 4.00
## Mean : 4.106 Mean : 1.877 Mean : 1.728 Mean : 3.06
## 3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 2.000 3rd Qu.: 4.00
## Max. : 5.000 Max. : 5.000 Max. : 5.000 Max. : 5.00
##
## rep_cost_3_blk ustructure_fail_3_blk waterh_fail_3_blk
## Min. :-1.000 Min. :-1.000 Min. :-1.000
## 1st Qu.: 5.000 1st Qu.: 5.000 1st Qu.: 0.000
## Median : 5.000 Median : 5.000 Median : 1.000
## Mean : 4.783 Mean : 4.428 Mean : 1.149
## 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 2.000
## Max. : 5.000 Max. : 5.000 Max. : 5.000
##
## loaddate customer_cnt_active_policies
## 2020-08-07 08:09:37.497:1995765 Min. : 1.00
## 1st Qu.: 1.00
## Median : 1.00
## Mean : 1.76
## 3rd Qu.: 1.00
## Max. :147.00
##
## customer_cnt_active_policies_binned
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 1.000
## Mean : 2.939
## 3rd Qu.: 1.000
## Max. :150.000
##
We can use goodfit() from the vcd package. H0: the claim counts follow a Poisson distribution.
# Fit a Poisson distribution to the per-row claim counts by maximum likelihood,
# then print the likelihood-ratio goodness-of-fit test.
gf <- goodfit(
  dataset$cova_ic_nc_water,
  type = "poisson",
  method = "ML"
)
summary(gf)
##
## Goodness-of-fit test for poisson distribution
##
## X^2 df P(> X^2)
## Likelihood Ratio 630.459 2 1.251885e-137
Plot
# Draw the fitted goodness-of-fit object against the observed counts.
plot(
  gf,
  main = "Number of Water Claims data vs Poisson distribution"
)
P-Value
# Extract the goodness-of-fit p-value for `gf`.
# summary() on a goodfit object returns (invisibly) the matrix it prints, with
# the columns "X^2", "df" and "P(> X^2)". Reading the value from that matrix
# avoids re-parsing the printed text, which relied on the p-value sitting on
# the fifth captured line and kept only the printed digits.
# capture.output() is retained only to stop summary() printing the table again.
invisible(capture.output(gf.summary <- summary(gf)))
pvalue <- unname(gf.summary[1, "P(> X^2)"])
pvalue
## [1] 1.251885e-137
The p-value is less than 0.05, so we can reject H0.
It is not a Poisson distribution
# Repeat the goodness-of-fit test with a negative binomial distribution, again
# fitted by maximum likelihood, and print the test table.
gf <- goodfit(
  dataset$cova_ic_nc_water,
  type = "nbinomial",
  method = "ML"
)
summary(gf)
##
## Goodness-of-fit test for nbinomial distribution
##
## X^2 df P(> X^2)
## Likelihood Ratio 1.767831 1 0.1836508
# Draw the fitted negative binomial goodness-of-fit object against the data.
plot(gf, main = "Number of Water Claims data vs Negative Binomial distribution")

# Extract the goodness-of-fit p-value for `gf`.
# summary() on a goodfit object returns (invisibly) the matrix it prints, with
# the columns "X^2", "df" and "P(> X^2)". Reading the value from that matrix
# avoids re-parsing the printed text, which relied on the p-value sitting on
# the fifth captured line and kept only the printed digits.
# capture.output() is retained only to stop summary() printing the table again.
invisible(capture.output(gf.summary <- summary(gf)))
pvalue <- unname(gf.summary[1, "P(> X^2)"])
pvalue
## [1] 0.1836508
The p-value is larger than 0.05, so we cannot reject H0. The distribution is closer to a negative binomial.
# Read the property water-claims CSV into `data`, the unfiltered source for the
# severity subsets built below; the first row supplies the column names.
data <- read.csv(
  file = "C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv",
  header = TRUE
)
Positive losses only (amounts of at least 100)
# Keep only the rows whose water loss amount is at least 100.
dataset <- data[data[["cova_il_nc_water"]] >= 100, ]
We also remove the long tail of large losses (the top 5%), which improves the fit of a Gamma distribution but worsens the normality of the log losses.
# Trim the upper tail: keep losses strictly below the 95th percentile of the
# current subset.
loss_p95 <- quantile(dataset$cova_il_nc_water, 0.95)
dataset <- dataset[dataset$cova_il_nc_water < loss_p95, ]
# Distribution diagnostics for the loss amounts in `dataset`, first on the raw
# scale and then on the log scale: histogram, kernel density and normal QQ plot.

# ---- Raw scale ----
ggplot(dataset, aes(x = .data[["cova_il_nc_water"]])) +
  geom_histogram(bins = 100) +
  labs(
    x = "cova_il_nc_water", y = "Count",
    title = paste("Histogram of", "cova_il_nc_water")
  ) +
  theme_light()

ggplot(dataset, aes(x = .data[["cova_il_nc_water"]])) +
  geom_density() +
  labs(
    x = "cova_il_nc_water", y = "Density",
    title = paste("Density of", "cova_il_nc_water")
  )

# QQ reference line through the first and third quartiles of the sample and of
# the standard normal distribution.
vec <- dataset$cova_il_nc_water
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y) / diff(x)
int <- y[1L] - slope * x[1L]

# The colour is set as a fixed value on the layer. Inside aes() the string
# "red" is treated as data, which draws the points in the default palette
# colour and adds a legend that then has to be hidden.
ggplot(dataset, aes(sample = .data[["cova_il_nc_water"]])) +
  stat_qq(colour = "red") +
  geom_abline(slope = slope, intercept = int) +
  labs(y = "cova_il_nc_water", title = paste("QQ Plot of", "cova_il_nc_water"))

# ---- Log scale ----
ggplot(dataset, aes(x = .data[["log_cova_il_nc_water"]])) +
  geom_histogram(bins = 100) +
  labs(
    x = "log(cova_il_nc_water)", y = "Count",
    title = paste("Histogram of log ", "cova_il_nc_water")
  ) +
  theme_light()

ggplot(dataset, aes(x = .data[["log_cova_il_nc_water"]])) +
  geom_density() +
  labs(
    x = "log(cova_il_nc_water)", y = "Density",
    title = "Density of log(cova_il_nc_water)"
  )

# Same quartile-based reference line, now for the log losses.
vec <- dataset$log_cova_il_nc_water
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y) / diff(x)
int <- y[1L] - slope * x[1L]

ggplot(dataset, aes(sample = .data[["log_cova_il_nc_water"]])) +
  stat_qq(colour = "red") +
  geom_abline(slope = slope, intercept = int) +
  labs(y = "log(cova_il_nc_water)", title = "QQ Plot of log(cova_il_nc_water)")
https://stackoverflow.com/questions/45536234/how-would-you-fit-a-gamma-distribution-to-a-data-in-r https://www.r-bloggers.com/goodness-of-fit-test-in-r/
# Raw-scale severity diagnostics from base graphics and fitdistrplus:
# an index plot of the losses, the empirical histogram/density panel, and the
# descriptive summary with 500 bootstrap replicates.
plot(dataset$cova_il_nc_water, pch = 20)
plotdist(data = dataset$cova_il_nc_water, histo = TRUE, demp = TRUE)
descdist(data = dataset$cova_il_nc_water, discrete = FALSE, boot = 500)
## summary statistics
## ------
## min: 101.16 max: 46478.7
## median: 6592.555
## mean: 10060.89
## estimated sd: 9921.511
## estimated skewness: 1.491259
## estimated kurtosis: 4.749601
Scaling is needed to prevent "Error in fitdist(dataset$cova_il_nc_water/10, "gamma", method = "mle") : the function mle failed to estimate the parameters, with the error code 100".
It looks like some numerical stability problem with the underlying algorithm. It hits something indistinguishable from infinity.
# Fit three candidate severity distributions by maximum likelihood. The losses
# are divided by 100 before fitting, so the fitted scale/rate parameters refer
# to the rescaled values.
fit_w <- fitdist(
  data = dataset$cova_il_nc_water / 100, distr = "weibull", method = "mle"
)
fit_g <- fitdist(
  data = dataset$cova_il_nc_water / 100, distr = "gamma", method = "mle"
)
fit_ln <- fitdist(
  data = dataset$cova_il_nc_water / 100, distr = "lnorm", method = "mle"
)
# Parameter estimates and information criteria for the Weibull fit.
summary(fit_w)
## Fitting of the distribution ' weibull ' by maximum likelihood
## Parameters :
## estimate Std. Error
## shape 1.013154 0.007503436
## scale 101.189548 1.004411260
## Loglikelihood: -61834.32 AIC: 123672.6 BIC: 123687.2
## Correlation matrix:
## shape scale
## shape 1.0000000 0.3200784
## scale 0.3200784 1.0000000
# Parameter estimates, log-likelihood and AIC/BIC for the gamma fit.
summary(fit_g)
## Fitting of the distribution ' gamma ' by maximum likelihood
## Parameters :
## estimate Std. Error
## shape 1.04059128 0.0122611408
## rate 0.01034088 0.0001534012
## Loglikelihood: -61830.34 AIC: 123664.7 BIC: 123679.3
## Correlation matrix:
## shape rate
## shape 1.0000000 0.7817832
## rate 0.7817832 1.0000000
# Parameter estimates, log-likelihood and AIC/BIC for the lognormal fit.
summary(fit_ln)
## Fitting of the distribution ' lnorm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## meanlog 4.059114 0.011257903
## sdlog 1.181812 0.007960514
## Loglikelihood: -62209.01 AIC: 124422 BIC: 124436.6
## Correlation matrix:
## meanlog sdlog
## meanlog 1 0
## sdlog 0 1
# Compare the three fitted distributions against the data using the four
# fitdistrplus comparison plots: density, CDF, Q-Q and P-P.
# Legend labels are given in the same order as the fits in the list.
plot.legend <- c("Weibull", "gamma", "lognormal")
severity_fits <- list(fit_w, fit_g, fit_ln)
denscomp(severity_fits, legendtext = plot.legend)
cdfcomp(severity_fits, legendtext = plot.legend)
qqcomp(severity_fits, legendtext = plot.legend)
ppcomp(severity_fits, legendtext = plot.legend)
The Gamma and Weibull fits are close to each other. None of the candidate distributions fits the observed data well; the least bad of them is the Gamma.
# Sample mean and variance of the loss amounts in `dataset`; these feed the
# moment-based gamma parameters computed afterwards.
m <- mean(dataset[["cova_il_nc_water"]])
v <- var(dataset[["cova_il_nc_water"]])
print(m)
## [1] 10060.89
# Show the sample variance of the losses.
print(v)
## [1] 98436385
# Moment-based gamma parameters from the sample mean `m` and variance `v`:
# shape = mean^2 / variance, scale = variance / mean.
shape <- m * m / v
scale <- v / m
print(shape)
## [1] 1.028293
# Show the reciprocal of the scale, i.e. the corresponding gamma rate.
print(1 / scale)
## [1] 0.000102207
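Shape = 1.024066 Scale = 35519.68 (note: these values do not match the output printed above, which gives shape = 1.028293 and 1/scale = 0.000102207, i.e. scale ≈ 9784.06)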
The Kolmogorov-Smirnov test is a simple nonparametric test for one-dimensional probability distributions. Like the Cramér-von Mises test, it compares the empirical distribution with a reference distribution.
# Two-sample Kolmogorov-Smirnov test of the observed losses against a random
# sample drawn from the moment-fitted gamma distribution.
num_of_samples <- 10000
# `scale` already holds variance / mean, so it is passed as the gamma scale
# unchanged. Passing 1 / scale here would hand the rate to the `scale`
# argument and put the simulated losses on a completely different scale from
# the data (the KS statistic then degenerates to D = 1).
y <- rgamma(num_of_samples, shape = shape, scale = scale)
ks.test(dataset$cova_il_nc_water, y)
## Warning in ks.test(dataset$cova_il_nc_water, y): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: dataset$cova_il_nc_water and y
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Rebuild the severity subset from the unfiltered data: rows whose water loss
# amount is strictly greater than 100, with no upper-tail trimming.
dataset <- data[data[["cova_il_nc_water"]] > 100, ]
# Distribution diagnostics for the loss amounts in `dataset`, first on the raw
# scale and then on the log scale: histogram, kernel density and normal QQ plot.

# ---- Raw scale ----
ggplot(dataset, aes(x = .data[["cova_il_nc_water"]])) +
  geom_histogram(bins = 100) +
  labs(
    x = "cova_il_nc_water", y = "Count",
    title = paste("Histogram of", "cova_il_nc_water")
  ) +
  theme_light()

ggplot(dataset, aes(x = .data[["cova_il_nc_water"]])) +
  geom_density() +
  labs(
    x = "cova_il_nc_water", y = "Density",
    title = paste("Density of", "cova_il_nc_water")
  )

# QQ reference line through the first and third quartiles of the sample and of
# the standard normal distribution.
vec <- dataset$cova_il_nc_water
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y) / diff(x)
int <- y[1L] - slope * x[1L]

# The colour is set as a fixed value on the layer. Inside aes() the string
# "red" is treated as data, which draws the points in the default palette
# colour and adds a legend that then has to be hidden.
ggplot(dataset, aes(sample = .data[["cova_il_nc_water"]])) +
  stat_qq(colour = "red") +
  geom_abline(slope = slope, intercept = int) +
  labs(y = "cova_il_nc_water", title = paste("QQ Plot of", "cova_il_nc_water"))

# ---- Log scale ----
ggplot(dataset, aes(x = .data[["log_cova_il_nc_water"]])) +
  geom_histogram(bins = 100) +
  labs(
    x = "log(cova_il_nc_water)", y = "Count",
    title = paste("Histogram of log ", "cova_il_nc_water")
  ) +
  theme_light()

ggplot(dataset, aes(x = .data[["log_cova_il_nc_water"]])) +
  geom_density() +
  labs(
    x = "log(cova_il_nc_water)", y = "Density",
    title = "Density of log(cova_il_nc_water)"
  )

# Same quartile-based reference line, now for the log losses.
vec <- dataset$log_cova_il_nc_water
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y) / diff(x)
int <- y[1L] - slope * x[1L]

ggplot(dataset, aes(sample = .data[["log_cova_il_nc_water"]])) +
  stat_qq(colour = "red") +
  geom_abline(slope = slope, intercept = int) +
  labs(y = "log(cova_il_nc_water)", title = "QQ Plot of log(cova_il_nc_water)")
# Shapiro-Wilk normality test on a random subset of the log losses.
# shapiro.test() accepts at most 5000 observations, hence the subsample. The
# size is capped at the number of available rows so that sample() cannot fail
# when `dataset` has fewer than 5000 rows; with 5000 or more rows the draw is
# the same as before.
# The draw is not seeded, so the statistic and p-value vary between runs.
shapiro_n <- min(5000L, nrow(dataset))
shapiro.test(dataset[sample(nrow(dataset), shapiro_n), "log_cova_il_nc_water"])
##
## Shapiro-Wilk normality test
##
## data: dataset[sample(nrow(dataset), 5000), "log_cova_il_nc_water"]
## W = 0.99235, p-value = 9.133e-16
The p-value is < 0.05, implying that the distribution of the data is significantly different from a normal distribution. In other words, we can NOT assume normality.
# Log-scale severity diagnostics from base graphics and fitdistrplus:
# an index plot of the log losses, the empirical histogram/density panel, and
# the descriptive summary with 500 bootstrap replicates.
plot(dataset$log_cova_il_nc_water, pch = 20)
plotdist(data = dataset$log_cova_il_nc_water, histo = TRUE, demp = TRUE)
descdist(data = dataset$log_cova_il_nc_water, discrete = FALSE, boot = 500)
## summary statistics
## ------
## min: 4.616703 max: 13.16683
## median: 8.869405
## mean: 8.791553
## estimated sd: 1.281621
## estimated skewness: -0.3127817
## estimated kurtosis: 3.078105
# Fit a normal distribution to the log losses by maximum likelihood and print
# the parameter estimates with their fit statistics.
fit_n <- fitdist(
  data = dataset$log_cova_il_nc_water,
  distr = "norm",
  method = "mle"
)
summary(fit_n)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## mean 8.791553 0.011898529
## sd 1.281566 0.008413507
## Loglikelihood: -19339.11 AIC: 38682.23 BIC: 38696.95
## Correlation matrix:
## mean sd
## mean 1 0
## sd 0 1
# Overlay the fitted normal distribution on the empirical density and CDF of
# the log losses.
plot.legend <- c("Normal")
normal_fits <- list(fit_n)
denscomp(normal_fits, legendtext = plot.legend)
cdfcomp(normal_fits, legendtext = plot.legend)