dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv", header=TRUE)
Claim Partial Dependency XGB Classification dataset exported from a Python notebook
pd_gamma_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA_Severity_Gamma_PartialDependency.csv", header=TRUE)
pd_normal_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA _Severity_Normal_PartialDependency.csv", header=TRUE)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.6.5 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggplot2)
colnames(dataset)
## [1] "modeldata_id"
## [2] "systemidstart"
## [3] "systemidend"
## [4] "cal_year"
## [5] "startdate"
## [6] "enddate"
## [7] "startdatetm"
## [8] "enddatetm"
## [9] "ecy"
## [10] "log_ecy"
## [11] "policynumber"
## [12] "policy_uniqueid"
## [13] "policyterm"
## [14] "policytype"
## [15] "effectivedate"
## [16] "expirationdate"
## [17] "policystate"
## [18] "policyform"
## [19] "persistency"
## [20] "companycd"
## [21] "carriercd"
## [22] "agency_group"
## [23] "producername"
## [24] "territory"
## [25] "risknumber"
## [26] "risktype"
## [27] "yearbuilt"
## [28] "log_yearbuilt"
## [29] "sqft"
## [30] "log_sqft"
## [31] "stories"
## [32] "roofcd"
## [33] "roofcd_encd"
## [34] "units"
## [35] "occupancycd"
## [36] "occupancy_encd"
## [37] "allperilded"
## [38] "waterded"
## [39] "protectionclass"
## [40] "constructioncd"
## [41] "constructioncd_encd"
## [42] "fire_risk_model_score"
## [43] "multipolicyind"
## [44] "multipolicyindumbrella"
## [45] "earthquakeumbrellaind"
## [46] "usagetype"
## [47] "usagetype_encd"
## [48] "ordinanceorlawpct"
## [49] "functionalreplacementcost"
## [50] "homegardcreditind"
## [51] "sprinklersystem"
## [52] "landlordind"
## [53] "rentersinsurance"
## [54] "firealarmtype"
## [55] "burglaryalarmtype"
## [56] "waterdetectiondevice"
## [57] "neighborhoodcrimewatchind"
## [58] "propertymanager"
## [59] "safeguardplusind"
## [60] "kitchenfireextinguisherind"
## [61] "gatedcommunityind"
## [62] "deadboltind"
## [63] "poolind"
## [64] "replacementcostdwellingind"
## [65] "replacementvalueind"
## [66] "serviceline"
## [67] "equipmentbreakdown"
## [68] "numberoffamilies"
## [69] "insuredage"
## [70] "maritalstatus"
## [71] "insurancescore"
## [72] "overriddeninsurancescore"
## [73] "insurancescorevalue"
## [74] "insscoretiervalueband"
## [75] "financialstabilitytier"
## [76] "allcov_wp"
## [77] "cova_wp"
## [78] "cova_ep"
## [79] "cova_deductible"
## [80] "log_cova_deductible"
## [81] "cova_limit"
## [82] "log_cova_limit"
## [83] "cova_ic_nc_water"
## [84] "hasclaim"
## [85] "cova_il_nc_water"
## [86] "log_cova_il_nc_water"
## [87] "water_risk_3_blk"
## [88] "log_water_risk_3_blk"
## [89] "water_risk_fre_3_blk"
## [90] "log_water_risk_fre_3_blk"
## [91] "water_risk_sev_3_blk"
## [92] "log_water_risk_sev_3_blk"
## [93] "appl_fail_3_blk"
## [94] "fixture_leak_3_blk"
## [95] "pipe_froze_3_blk"
## [96] "plumb_leak_3_blk"
## [97] "rep_cost_3_blk"
## [98] "ustructure_fail_3_blk"
## [99] "waterh_fail_3_blk"
## [100] "loaddate"
## [101] "customer_cnt_active_policies"
## [102] "customer_cnt_active_policies_binned"
## [103] "cova_ic_nc_water_color"
dataset <- dataset[dataset$cova_il_nc_water>0,]
str(dataset)
## 'data.frame': 11636 obs. of 103 variables:
## $ modeldata_id : int 2023355 715490 1547393 883567 1255847 68156 449688 771338 307815 478115 ...
## $ systemidstart : int 8610943 1821627 5824311 2381417 4210440 122953 1028121 1998334 556945 1087455 ...
## $ systemidend : int 8909673 2170234 5824311 2381417 4569129 766663 1028136 1998334 684575 1087455 ...
## $ cal_year : int 2020 2014 2018 2014 2016 2011 2012 2014 2011 2012 ...
## $ startdate : Factor w/ 4288 levels "2009-01-01","2009-01-02",..: 4018 1827 3288 2023 2819 755 1096 1827 862 1101 ...
## $ enddate : Factor w/ 4619 levels "2009-01-09","2009-01-12",..: 4299 1999 3577 2159 2890 1063 1222 2119 1063 1429 ...
## $ startdatetm : Factor w/ 23690 levels "2008-01-09 00:00:00",..: 22144 9454 18011 11303 15410 3837 4617 10072 4436 6063 ...
## $ enddatetm : Factor w/ 16455 levels "2009-01-09 00:00:00",..: 16134 6990 13263 8202 11321 3549 4094 7389 3987 4910 ...
## $ ecy : num 0.86 0.561 0.881 0.463 0.285 ...
## $ log_ecy : num -0.151 -0.578 -0.126 -0.771 -1.256 ...
## $ policynumber : Factor w/ 244668 levels "AZF0082147","AZF0221975",..: 21299 160696 219105 4636 11884 179531 150899 63697 183742 163254 ...
## $ policy_uniqueid : int 1733466 629173 1336770 789985 1094957 10079 280017 696661 292070 356186 ...
## $ policyterm : int 2 5 1 1 7 1 3 1 1 4 ...
## $ policytype : Factor w/ 2 levels "New","Renewal": 2 2 1 1 2 1 2 1 1 2 ...
## $ effectivedate : Factor w/ 4329 levels "2008-01-09","2008-01-14",..: 4009 1710 3288 2066 2862 798 932 1830 905 1144 ...
## $ expirationdate : Factor w/ 4450 levels "2009-01-09","2009-01-14",..: 4130 1831 3409 2187 2983 919 1054 1951 1027 1266 ...
## $ policystate : Factor w/ 3 levels "AZ","CA","NV": 1 2 2 1 1 2 2 2 2 2 ...
## $ policyform : Factor w/ 9 levels "DF1","DF3","DF6",..: 9 9 8 2 9 9 9 2 9 9 ...
## $ persistency : int 1 5 0 0 16 0 12 0 0 3 ...
## $ companycd : int 16 17 19 1 16 17 1 17 17 17 ...
## $ carriercd : Factor w/ 2 levels "CSEICO","CSESG": 1 2 2 1 1 2 1 2 2 2 ...
## $ agency_group : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 593 426 292 265 316 378 316 426 378 437 ...
## $ producername : Factor w/ 1269 levels "1ST CENTURY INS SVCS INC.",..: 1240 888 612 452 651 738 652 947 738 967 ...
## $ territory : Factor w/ 10 levels "","AZ-A","AZ-T",..: 2 7 4 6 6 8 6 7 8 6 ...
## $ risknumber : int 1 1 1 1 1 1 1 1 1 1 ...
## $ risktype : Factor w/ 2 levels "Dwelling","Homeowners": 2 2 2 1 2 2 2 1 2 2 ...
## $ yearbuilt : int 1979 1994 1976 2006 1992 1966 1963 2001 2004 1977 ...
## $ log_yearbuilt : num 7.59 7.6 7.59 7.6 7.6 ...
## $ sqft : int 1600 2000 1300 2700 1200 1600 1300 1600 1800 3200 ...
## $ log_sqft : num 7.44 7.61 7.2 7.92 7.09 ...
## $ stories : int 2 1 1 2 1 1 1 2 1 1 ...
## $ roofcd : Factor w/ 7 levels "COMPO","MEMBRANE",..: 4 6 6 1 1 1 4 6 6 6 ...
## $ roofcd_encd : int 6 7 7 8 8 8 6 7 7 7 ...
## $ units : int 1 1 1 1 1 1 1 1 1 1 ...
## $ occupancycd : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ occupancy_encd : int 1 1 1 1 1 1 1 1 1 1 ...
## $ allperilded : int 1000 1000 2500 1000 1000 1000 500 1000 1000 1000 ...
## $ waterded : int 0 0 0 0 0 0 0 0 0 0 ...
## $ protectionclass : int 1 3 0 4 6 4 2 2 4 3 ...
## $ constructioncd : Factor w/ 5 levels "AF","B","F","M",..: 5 3 1 4 3 3 3 1 3 3 ...
## $ constructioncd_encd : int 2 5 4 1 5 5 5 4 5 5 ...
## $ fire_risk_model_score : int 0 0 0 -1 0 0 0 2 2 0 ...
## $ multipolicyind : int 1 1 0 0 1 0 0 0 0 0 ...
## $ multipolicyindumbrella : int 0 0 0 0 0 0 0 0 0 0 ...
## $ earthquakeumbrellaind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ usagetype : Factor w/ 7 levels "COC","PRIMARY",..: 2 2 2 3 2 2 2 3 2 2 ...
## $ usagetype_encd : int 7 7 7 6 7 7 7 6 7 7 ...
## $ ordinanceorlawpct : int 10 0 0 0 0 0 10 10 0 0 ...
## $ functionalreplacementcost : int 0 0 0 0 0 0 0 0 0 0 ...
## $ homegardcreditind : int 1 0 0 0 0 0 1 0 0 0 ...
## $ sprinklersystem : int 0 0 0 0 0 0 0 0 0 0 ...
## $ landlordind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rentersinsurance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ firealarmtype : int 0 0 1 1 1 0 1 1 1 0 ...
## $ burglaryalarmtype : int 0 0 0 0 0 0 0 0 0 0 ...
## $ waterdetectiondevice : int 0 0 0 0 0 0 0 0 0 0 ...
## $ neighborhoodcrimewatchind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ propertymanager : int 0 0 0 0 0 0 0 0 0 0 ...
## $ safeguardplusind : int 0 0 0 0 0 1 0 1 0 1 ...
## $ kitchenfireextinguisherind : int 0 1 0 0 0 0 0 1 0 1 ...
## $ gatedcommunityind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ deadboltind : int 1 1 1 1 0 0 0 1 1 1 ...
## $ poolind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ replacementcostdwellingind : int 0 0 0 1 0 0 0 0 0 0 ...
## $ replacementvalueind : int 0 0 1 0 0 0 0 0 0 0 ...
## $ serviceline : int 0 0 0 0 0 0 0 0 0 0 ...
## $ equipmentbreakdown : int 1 0 0 0 0 0 0 0 0 0 ...
## $ numberoffamilies : int 1 1 1 1 1 1 1 1 1 1 ...
## $ insuredage : int 43 39 61 50 48 40 57 45 45 57 ...
## $ maritalstatus : Factor w/ 5 levels "~","Divorced",..: 4 4 4 4 1 1 1 3 4 1 ...
## $ insurancescore : Factor w/ 3842 levels "(DOES","~","610",..: 3294 2 2 516 2 2 2 2 2 2 ...
## $ overriddeninsurancescore : Factor w/ 41 levels "~","01","02",..: 17 1 1 30 40 1 1 1 1 1 ...
## $ insurancescorevalue : Factor w/ 178 levels "~","605","626",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ insscoretiervalueband : Factor w/ 24 levels "~","599-606",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ financialstabilitytier : Factor w/ 42 levels "","~","01","02",..: 18 1 1 1 1 1 1 1 1 1 ...
## $ allcov_wp : int 1096 688 1075 634 605 863 643 636 722 1207 ...
## $ cova_wp : int 1036 964 655 598 901 852 1048 475 949 1472 ...
## $ cova_ep : num 360 541 577 277 257 ...
## $ cova_deductible : int 1000 1000 2500 1000 1000 1000 500 1000 1000 1000 ...
## $ log_cova_deductible : num 6.91 6.91 7.82 6.91 6.91 ...
## $ cova_limit : int 300000 400000 300000 300000 200000 300000 300000 400000 300000 600000 ...
## $ log_cova_limit : num 12.3 12.7 12.6 12.5 11.9 ...
## $ cova_ic_nc_water : int 1 1 1 1 1 1 1 2 1 1 ...
## $ hasclaim : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cova_il_nc_water : num 1146 13187 21404 790 22400 ...
## $ log_cova_il_nc_water : num 7.04 9.49 9.97 6.67 10.02 ...
## $ water_risk_3_blk : int 165 112 593 168 190 219 136 245 212 222 ...
## $ log_water_risk_3_blk : num 5.11 4.72 6.39 5.12 5.25 ...
## $ water_risk_fre_3_blk : int 244 100 484 174 188 143 122 206 166 175 ...
## $ log_water_risk_fre_3_blk : num 5.5 4.61 6.18 5.16 5.24 ...
## $ water_risk_sev_3_blk : int 70 115 126 99 104 157 115 122 132 131 ...
## $ log_water_risk_sev_3_blk : num 4.25 4.74 4.84 4.6 4.64 ...
## $ appl_fail_3_blk : int 1 5 5 1 5 4 5 5 5 4 ...
## $ fixture_leak_3_blk : int 4 3 2 4 5 2 2 2 2 2 ...
## $ pipe_froze_3_blk : int 0 1 0 3 3 2 3 0 2 3 ...
## $ plumb_leak_3_blk : int 4 4 4 5 5 4 1 5 4 4 ...
## $ rep_cost_3_blk : int 4 1 5 1 0 5 5 5 5 5 ...
## $ ustructure_fail_3_blk : int 5 5 5 5 5 5 5 5 5 2 ...
## $ waterh_fail_3_blk : int 2 2 0 5 5 2 2 2 2 4 ...
## [list output truncated]
summary(dataset)
## modeldata_id systemidstart systemidend cal_year
## Min. : 179 Min. : 745 Min. : 745 Min. :2009
## 1st Qu.: 547915 1st Qu.:1259699 1st Qu.:1376745 1st Qu.:2013
## Median :1039204 Median :2963060 Median :3133598 Median :2016
## Mean :1029895 Mean :3542800 Mean :3679403 Mean :2015
## 3rd Qu.:1535314 3rd Qu.:5742663 3rd Qu.:5942212 3rd Qu.:2018
## Max. :2102494 Max. :9136529 Max. :9647449 Max. :2020
##
## startdate enddate startdatetm
## 2019-01-01: 860 2019-01-01: 794 2018-05-01 00:00:00: 14
## 2018-01-01: 738 2018-01-01: 635 2016-10-01 00:00:00: 13
## 2017-01-01: 573 2017-01-01: 601 2018-02-28 00:00:00: 12
## 2016-01-01: 557 2020-01-01: 577 2017-09-01 00:00:00: 11
## 2013-01-01: 494 2015-01-01: 505 2018-08-17 00:00:00: 11
## 2014-01-01: 486 2016-01-01: 499 2016-10-15 00:00:00: 10
## (Other) :7928 (Other) :8025 (Other) :11565
## enddatetm ecy log_ecy
## 2019-05-01 00:00:00: 17 Min. :0.0054 Min. :-5.221356
## 2018-08-01 00:00:00: 14 1st Qu.:0.4572 1st Qu.:-0.782634
## 2018-09-01 00:00:00: 14 Median :0.6625 Median :-0.411735
## 2019-07-01 00:00:00: 14 Mean :0.6343 Mean :-0.560624
## 2017-10-01 00:00:00: 13 3rd Qu.:0.8350 3rd Qu.:-0.180324
## 2019-02-28 00:00:00: 13 Max. :1.0020 Max. : 0.001998
## (Other) :11551
## policynumber policy_uniqueid policyterm policytype
## CAH6012203: 6 Min. : 114 Min. : 1.000 New :2704
## CAH0704671: 4 1st Qu.: 495973 1st Qu.: 2.000 Renewal:8932
## CAH0751641: 4 Median : 912645 Median : 3.000
## CAH0752785: 4 Mean : 895551 Mean : 3.818
## CAH0755516: 4 3rd Qu.:1323472 3rd Qu.: 6.000
## CAH0755561: 4 Max. :1794004 Max. :12.000
## (Other) :11610
## effectivedate expirationdate policystate policyform
## 2018-05-01: 17 2019-05-01: 17 AZ: 1076 HO3 :7714
## 2017-08-01: 15 2018-08-01: 15 CA:10127 DF3 :2961
## 2016-10-01: 14 2017-10-01: 14 NV: 433 DF6 : 563
## 2018-07-01: 14 2019-07-01: 14 Form3 : 323
## 2018-08-01: 14 2019-08-01: 14 FL3-Special: 53
## 2018-08-17: 14 2019-08-17: 14 FL2-Broad : 14
## (Other) :11548 (Other) :11548 (Other) : 8
## persistency companycd carriercd
## Min. : 0.000 Min. : 1.00 CSEICO:4629
## 1st Qu.: 1.000 1st Qu.: 1.00 CSESG :7007
## Median : 3.000 Median :17.00
## Mean : 6.188 Mean :11.51
## 3rd Qu.: 9.000 3rd Qu.:17.00
## Max. :44.000 Max. :19.00
##
## agency_group
## WESTERN GOLD INS AGCY INC. : 870
## J.E. BROWN and ASSOCS INS SVCS : 488
## CRUSBERG DECKER INS SVCS INC : 463
## Acrisure of California : 461
## ISU INSURANCE SERVICES OF SAN FRANCISCO INC: 430
## PIIB - PACIFIC INTERSTATE INS : 310
## (Other) :8614
## producername territory risknumber
## WESTERN GOLD INS AGCY INC. : 853 CA-B :3677 Min. :1.000
## J.E. BROWN and ASSOCS INS SVCS: 465 CA-C :3008 1st Qu.:1.000
## CRUSBERG DECKER INS SVCS INC : 291 CA-A :1467 Median :1.000
## Acrisure of California LLC : 247 CA-O :1233 Mean :1.001
## HUB INTERNATIONAL INS SVCS : 169 AZ-A : 929 3rd Qu.:1.000
## MULTI-STATE INS SERVICES INC : 166 CA-T : 812 Max. :2.000
## (Other) :9445 (Other): 510
## risktype yearbuilt log_yearbuilt sqft
## Dwelling :3599 Min. :1900 Min. :7.533 Min. : 800
## Homeowners:8037 1st Qu.:1972 1st Qu.:7.587 1st Qu.:1500
## Median :1987 Median :7.594 Median :1900
## Mean :1982 Mean :7.592 Mean :2086
## 3rd Qu.:1996 3rd Qu.:7.599 3rd Qu.:2500
## Max. :2019 Max. :7.610 Max. :5000
##
## log_sqft stories roofcd roofcd_encd
## Min. :6.397 Min. :1.00 COMPO :4228 Min. :1.000
## 1st Qu.:7.313 1st Qu.:1.00 MEMBRANE: 127 1st Qu.:7.000
## Median :7.576 Median :1.00 METAL : 32 Median :7.000
## Mean :7.585 Mean :1.26 OTHER :1646 Mean :7.073
## 3rd Qu.:7.862 3rd Qu.:2.00 TAR : 210 3rd Qu.:8.000
## Max. :9.051 Max. :3.00 TILE :5271 Max. :8.000
## WOOD : 122
## units occupancycd occupancy_encd allperilded
## Min. :1.000 NO : 0 Min. :1.000 Min. : 100
## 1st Qu.:1.000 OCCUPIEDNOW:11093 1st Qu.:1.000 1st Qu.: 500
## Median :1.000 TENANT : 543 Median :1.000 Median : 1000
## Mean :1.089 Mean :1.047 Mean : 1156
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.: 1000
## Max. :4.000 Max. :2.000 Max. :10000
##
## waterded protectionclass constructioncd constructioncd_encd
## Min. : 0.00 Min. : 0.000 AF :4413 Min. :1.000
## 1st Qu.: 0.00 1st Qu.: 2.000 B : 178 1st Qu.:4.000
## Median : 0.00 Median : 3.000 F :6744 Median :5.000
## Mean : 39.96 Mean : 3.073 M : 162 Mean :4.499
## 3rd Qu.: 0.00 3rd Qu.: 4.000 OTHER: 139 3rd Qu.:5.000
## Max. :10000.00 Max. :10.000 Max. :5.000
##
## fire_risk_model_score multipolicyind multipolicyindumbrella
## Min. :-1.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.000000
## Median : 0.0000 Median :0.0000 Median :0.000000
## Mean : 0.3019 Mean :0.1828 Mean :0.004727
## 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :12.0000 Max. :1.0000 Max. :1.000000
##
## earthquakeumbrellaind usagetype usagetype_encd ordinanceorlawpct
## Min. :0.00000 COC : 4 Min. :2.000 Min. : 0.000
## 1st Qu.:0.00000 PRIMARY :8212 1st Qu.:6.000 1st Qu.: 0.000
## Median :0.00000 RENTAL :3374 Median :7.000 Median : 10.000
## Mean :0.00318 SEASONAL : 36 Mean :6.693 Mean : 9.297
## 3rd Qu.:0.00000 SECONDARY : 8 3rd Qu.:7.000 3rd Qu.: 10.000
## Max. :1.00000 UNOCCUPIED: 0 Max. :7.000 Max. :100.000
## VACANT : 2
## functionalreplacementcost homegardcreditind sprinklersystem
## Min. :0.0000000 Min. :0.000 Min. :0.00000
## 1st Qu.:0.0000000 1st Qu.:0.000 1st Qu.:0.00000
## Median :0.0000000 Median :0.000 Median :0.00000
## Mean :0.0008594 Mean :0.208 Mean :0.03017
## 3rd Qu.:0.0000000 3rd Qu.:0.000 3rd Qu.:0.00000
## Max. :1.0000000 Max. :1.000 Max. :1.00000
##
## landlordind rentersinsurance firealarmtype burglaryalarmtype
## Min. :0.00000 Min. :0.000000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.000000 Median :1.0000 Median :0.0000
## Mean :0.03644 Mean :0.004383 Mean :0.6368 Mean :0.3304
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.000000 Max. :1.0000 Max. :1.0000
##
## waterdetectiondevice neighborhoodcrimewatchind propertymanager
## Min. :0.0000000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000000 Median :0.00000 Median :0.0000
## Mean :0.0001719 Mean :0.01934 Mean :0.0122
## 3rd Qu.:0.0000000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000000 Max. :1.00000 Max. :1.0000
##
## safeguardplusind kitchenfireextinguisherind gatedcommunityind
## Min. :0.0000 Min. :0.000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.00000
## Median :0.0000 Median :0.000 Median :0.00000
## Mean :0.3682 Mean :0.419 Mean :0.01916
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.000 Max. :1.00000
##
## deadboltind poolind replacementcostdwellingind
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :0.0000
## Mean :0.6983 Mean :0.06033 Mean :0.2474
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## replacementvalueind serviceline equipmentbreakdown numberoffamilies
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :1.000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :1.000
## Mean :0.02424 Mean :0.1282 Mean :0.1384 Mean :1.089
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :4.000
##
## insuredage maritalstatus insurancescore overriddeninsurancescore
## Min. : 0.00 ~ :4882 ~ :10927 ~ :10490
## 1st Qu.: 45.00 Divorced: 67 99 : 5 99 : 396
## Median : 55.00 Married :3375 DWKDS : 3 07 : 81
## Mean : 55.11 Single :3180 FUQQS : 3 12 : 63
## 3rd Qu.: 65.00 Widowed : 132 SSDDV : 3 13 : 50
## Max. :114.00 VVKVS : 3 14 : 48
## NA's :1086 (Other): 692 (Other): 508
## insurancescorevalue insscoretiervalueband financialstabilitytier
## ~ :11629 ~ :11629 :10945
## 641 : 1 714-731: 2 ~ : 610
## 666 : 1 640-648: 1 07 : 8
## 680 : 1 664-672: 1 14 : 8
## 729 : 1 672-682: 1 08 : 6
## 731 : 1 820-837: 1 13 : 6
## (Other): 2 (Other): 1 (Other): 53
## allcov_wp cova_wp cova_ep cova_deductible
## Min. : 48.0 Min. : 9 Min. : 1.428 Min. : 100
## 1st Qu.: 644.8 1st Qu.: 609 1st Qu.: 309.908 1st Qu.: 500
## Median : 901.0 Median : 939 Median : 553.833 Median : 1000
## Mean :1014.0 Mean :1144 Mean : 719.183 Mean : 1156
## 3rd Qu.:1228.0 3rd Qu.:1469 3rd Qu.: 940.123 3rd Qu.: 1000
## Max. :7588.0 Max. :9468 Max. :8631.976 Max. :10000
##
## log_cova_deductible cova_limit log_cova_limit cova_ic_nc_water
## Min. :4.605 Min. : 100000 Min. : 8.372 Min. :1.000
## 1st Qu.:6.215 1st Qu.: 300000 1st Qu.:12.476 1st Qu.:1.000
## Median :6.908 Median : 400000 Median :12.799 Median :1.000
## Mean :6.868 Mean : 449742 Mean :12.759 Mean :1.023
## 3rd Qu.:6.908 3rd Qu.: 500000 3rd Qu.:13.115 3rd Qu.:1.000
## Max. :9.210 Max. :1300000 Max. :14.644 Max. :3.000
##
## hasclaim cova_il_nc_water log_cova_il_nc_water water_risk_3_blk
## Min. :1 Min. : 0.9 Min. :-0.1054 Min. : 23.0
## 1st Qu.:1 1st Qu.: 2942.6 1st Qu.: 7.9870 1st Qu.: 159.0
## Median :1 Median : 7082.0 Median : 8.8653 Median : 210.0
## Mean :1 Mean : 13614.2 Mean : 8.7738 Mean : 228.5
## 3rd Qu.:1 3rd Qu.: 15967.6 3rd Qu.: 9.6783 3rd Qu.: 271.0
## Max. :1 Max. :522735.2 Max. :13.1668 Max. :1056.0
##
## log_water_risk_3_blk water_risk_fre_3_blk log_water_risk_fre_3_blk
## Min. :3.135 Min. : 21.0 Min. :3.045
## 1st Qu.:5.069 1st Qu.: 125.0 1st Qu.:4.828
## Median :5.347 Median : 175.0 Median :5.165
## Mean :5.334 Mean : 186.5 Mean :5.124
## 3rd Qu.:5.602 3rd Qu.: 223.0 3rd Qu.:5.407
## Max. :6.962 Max. :1044.0 Max. :6.951
##
## water_risk_sev_3_blk log_water_risk_sev_3_blk appl_fail_3_blk
## Min. : 35.0 Min. :3.555 Min. :0.000
## 1st Qu.:108.0 1st Qu.:4.682 1st Qu.:4.000
## Median :128.0 Median :4.852 Median :5.000
## Mean :130.4 Mean :4.843 Mean :4.257
## 3rd Qu.:150.0 3rd Qu.:5.011 3rd Qu.:5.000
## Max. :265.0 Max. :5.580 Max. :5.000
##
## fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk rep_cost_3_blk
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.00
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:5.00
## Median :2.000 Median :2.000 Median :4.000 Median :5.00
## Mean :1.903 Mean :1.568 Mean :3.359 Mean :4.82
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:5.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
##
## ustructure_fail_3_blk waterh_fail_3_blk loaddate
## Min. :0.00 Min. :0.000 2020-08-07 08:09:37.497:11636
## 1st Qu.:4.00 1st Qu.:0.000
## Median :5.00 Median :2.000
## Mean :4.33 Mean :1.331
## 3rd Qu.:5.00 3rd Qu.:2.000
## Max. :5.00 Max. :5.000
##
## customer_cnt_active_policies customer_cnt_active_policies_binned
## Min. : 1.000 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 1.000 Median : 1.000
## Mean : 1.247 Mean : 1.839
## 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :28.000 Max. :30.000
##
## cova_ic_nc_water_color
## Min. :1.000
## 1st Qu.:1.000
## Median :1.000
## Mean :1.023
## 3rd Qu.:1.000
## Max. :3.000
##
ggplot(dataset, aes(x = .data[['cova_il_nc_water']], color="red",fill="#56B4E9")) +
geom_histogram(bins=100) +
labs(x = 'cova_il_nc_water', y = 'Count', title = "Histogram of Losses (cova_il_nc_water)")+
scale_color_manual(values=c("red")) +
scale_fill_manual(values=c("#56B4E9")) +
theme(legend.position = "none")
ggplot(dataset, aes(x = .data[['cova_il_nc_water']])) +
geom_density() +
labs(x = 'cova_il_nc_water', y = 'Density', title = "Density of Losses (cova_il_nc_water)")
vec <- dataset$cova_il_nc_water
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
ggplot(dataset, aes(sample = .data[['cova_il_nc_water']], col='red')) +
stat_qq() +
geom_abline(slope = slope, intercept = int) +
theme(legend.position = "none") +
labs(y = 'cova_il_nc_water', title = "QQ Plot of Losses (cova_il_nc_water)")
ggplot(dataset, aes(x = .data[['log_cova_il_nc_water']], color="red",fill="#56B4E9")) +
geom_histogram(bins=100) +
scale_color_manual(values=c("red")) +
scale_fill_manual(values=c("#56B4E9")) +
labs(x = 'log(cova_il_nc_water)', y = 'Count', title = "Histogram of Losses (log_cova_il_nc_water)")+
theme(legend.position = "none")
ggplot(dataset, aes(x = .data[['log_cova_il_nc_water']])) +
geom_density() +
labs(x = 'log(cova_il_nc_water)', y = 'Density', title = 'Density of Losses (log_cova_il_nc_water)')
vec <- dataset$log_cova_il_nc_water
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
ggplot(dataset, aes(sample = .data[['log_cova_il_nc_water']], col='red')) +
stat_qq() +
geom_abline(slope = slope, intercept = int) +
theme(legend.position = "none") +
labs(y = 'log(cova_il_nc_water)', title = 'QQ Plot of Losses (log_cova_il_nc_water)')
Looks like gamma distribution and log conversion is more close to normal.
plot_categorical <- function(col_name) {
#dataset[[col_name]] <- as.factor(dataset[[col_name]])
#ggplot(dataset, aes(x=.data[[col_name]], y=cova_il_nc_water, col=.data[[col_name]], fill=.data[[col_name]])) +
# geom_boxplot(notch=notch) +
# theme(legend.position = "none") +
# labs(x = col_name, title = paste("Box Plot of Losses and ", col_name))
if ( col_name == 'roofcd_encd') {
col_name_original <- 'roofcd'
}
else if ( col_name == 'usagetype_encd' ) {
col_name_original <- 'usagetype'
}
else if ( col_name == 'constructioncd_encd' ) {
col_name_original <- 'constructioncd'
}
else if ( col_name == 'occupancy_encd' ) {
col_name_original <- 'occupancycd'
}
else {
col_name_original <- col_name
}
if (grepl('encd',col_name) |
col_name =='fire_risk_model_score' |
col_name =='customer_cnt_active_policies_binned' |
col_name =='cova_deductible' |
col_name =='cova_limit' |
col_name =='protectionclass' |
col_name =='ordinanceorlawpct' |
col_name =='numberoffamilies' |
col_name =='waterded' |
col_name =='units' |
col_name =='sprinklersystem' |
col_name =='stories') {
XBreaks <- as.vector(unlist(unique(dataset[c(col_name)])))
XLabels <- as.vector(unlist(unique(dataset[c(col_name_original)])))
}
else if (col_name =='pipe_froze_3_blk' |
col_name =='water_risk_3_blk' |
col_name =='ustructure_fail_3_blk' |
col_name =='water_risk_fre_3_blk' |
col_name =='waterh_fail_3_blk' |
col_name =='rep_cost_3_blk' |
col_name =='plumb_leak_3_blk' |
col_name =='appl_fail_3_blk' |
col_name =='fixture_leak_3_blk') {
XBreaks <- c(0,1,2,3,4,5)
XLabels <- c('Low', 'Elevated', 'Below Avg', 'Average', 'High', 'Highest')
}
else {
XBreaks <- c(0,1)
XLabels <- c("No", "Yes")
}
pd_col <- pd_gamma_dataset[pd_gamma_dataset$feature == col_name,][c('value','pd')]
colnames(pd_col) <- c(col_name,'pd')
plotdata <- dataset %>%
group_by(.dots = col_name) %>%
dplyr::summarize(n = n(),
mean = mean(cova_il_nc_water),
sd = sd(cova_il_nc_water),
se = sd / sqrt(n) #,
#ci = qt(0.975, df = n - 1) * sd / sqrt(n)
)
plotdata <- merge(plotdata,pd_col,by=col_name)
cols <- c('Mean Losses with standard error'='#f04546','Losses Partial Dependency'='#3591d1')
p1 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +
geom_point(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses with standard error'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses with standard error')) +
geom_errorbar(aes(x = .data[[col_name]], y = mean,group = 1, ymin = mean - se,ymax = mean + se,color='Mean Losses with standard error'),width = .1) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p2 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p1)
print(p2)
#----------------------
pd_col <- pd_normal_dataset[pd_normal_dataset$feature == col_name,][c('value','pd')]
if (nrow(pd_col)>0) {
colnames(pd_col) <- c(col_name,'pd')
plotdata <- dataset %>%
group_by(.dots = col_name) %>%
dplyr::summarize(n = n(),
mean = mean(log_cova_il_nc_water),
sd = sd(log_cova_il_nc_water),
se = sd / sqrt(n) #,
#ci = qt(0.975, df = n - 1) * sd / sqrt(n)
)
plotdata <- merge(plotdata,pd_col,by=col_name)
cols <- c('Mean Losses (Log) with standard error'='#f04546','Losses (Log) Partial Dependency'='#3591d1')
p1 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +
geom_point(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses (Log) with standard error'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses (Log) with standard error')) +
geom_errorbar(aes(x = .data[[col_name]], y = mean,group = 1, ymin = mean - se,ymax = mean + se,color='Mean Losses (Log) with standard error'),width = .1) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p2 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p1)
print(p2)
}
}
plot_categorical('functionalreplacementcost')
plot_categorical('numberoffamilies')
plot_categorical('units')
plot_categorical('stories')
plot_categorical('cova_deductible')
plot_categorical('replacementvalueind')
plot_categorical('neighborhoodcrimewatchind')
plot_categorical('fire_risk_model_score')
ggplot(data=dataset, mapping = aes(x=water_risk_sev_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs water_risk_sev_3_blk")
The higest losses are between 100 and 200 water_risk_sev_3_blk. We may have not enough data for higher numbers of water_risk_sev_3_blk
plot_categorical('burglaryalarmtype')
plot_categorical('deadboltind')
plot_categorical('waterh_fail_3_blk')
plot_categorical('safeguardplusind')
plot_categorical('equipmentbreakdown')
plot_categorical('cova_limit')
ggplot(data=dataset, mapping = aes(x=water_risk_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs water_risk_3_blk")
There are more higher losses around 250 water_risk_3_blk
plot_categorical('usagetype_encd')
plot_categorical('multipolicyindumbrella')
plot_categorical('constructioncd_encd')
plot_categorical('replacementcostdwellingind')
plot_categorical('roofcd_encd')
plot_categorical('gatedcommunityind')
plot_categorical('homegardcreditind')
plot_categorical('ustructure_fail_3_blk')
ggplot(data=dataset, mapping = aes(x=water_risk_fre_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs water_risk_fre_3_blk")
plot_categorical('ecy')
ggplot(data=dataset, mapping = aes(x=ecy, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs ecy")
plot_categorical('appl_fail_3_blk')
plot_categorical('pipe_froze_3_blk')
plot_categorical('firealarmtype')
plot_categorical('fixture_leak_3_blk')
plot_categorical('sqft')
ggplot(data=dataset, mapping = aes(x=sqft, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs sqft")
Severity is higher in larger properties
plot_categorical('yearbuilt')
ggplot(data=dataset, mapping = aes(x=yearbuilt, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs YearBuilt")
Severity is higher in newer properties
plot_categorical('customer_cnt_active_policies_binned')
ggplot(data=dataset, mapping = aes(x=customer_cnt_active_policies_binned, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs customer_cnt_active_policies_binned")
plot_categorical('protectionclass')
plot_categorical('multipolicyind')
plot_categorical('rep_cost_3_blk')
plot_categorical('plumb_leak_3_blk')
plot_categorical('rentersinsurance')
plot_categorical('sprinklersystem')
plot_categorical('kitchenfireextinguisherind')
plot_categorical('poolind')
plot_categorical('landlordind')
plot_categorical('occupancy_encd')
plot_categorical('ordinanceorlawpct')
plot_categorical('waterded')
plot_categorical('serviceline')
plot_categorical('propertymanager')
plot_categorical('earthquakeumbrellaind')
ggplot(data=dataset, mapping = aes(x=yearbuilt, y=cova_il_nc_water, color=roofcd)) +
geom_point() +
labs(title = "Losses vs YearBuilt and Roofcd Limit")
ggplot(data=dataset, mapping = aes(x=sqft, y=cova_il_nc_water, color=-cova_limit)) +
geom_point() +
labs(title = "Losses vs Sqft and CovA Limit")