Data
dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv", header=TRUE)
Claim Partial Dependency XGB Classification dataset exported from a Python notebook
pd_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA_PartialDependency.csv", header=TRUE)
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.6.5 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggplot2)
colnames(dataset)
## [1] "modeldata_id"
## [2] "systemidstart"
## [3] "systemidend"
## [4] "cal_year"
## [5] "startdate"
## [6] "enddate"
## [7] "startdatetm"
## [8] "enddatetm"
## [9] "ecy"
## [10] "log_ecy"
## [11] "policynumber"
## [12] "policy_uniqueid"
## [13] "policyterm"
## [14] "policytype"
## [15] "effectivedate"
## [16] "expirationdate"
## [17] "policystate"
## [18] "policyform"
## [19] "persistency"
## [20] "companycd"
## [21] "carriercd"
## [22] "agency_group"
## [23] "producername"
## [24] "territory"
## [25] "risknumber"
## [26] "risktype"
## [27] "yearbuilt"
## [28] "log_yearbuilt"
## [29] "sqft"
## [30] "log_sqft"
## [31] "stories"
## [32] "roofcd"
## [33] "roofcd_encd"
## [34] "units"
## [35] "occupancycd"
## [36] "occupancy_encd"
## [37] "allperilded"
## [38] "waterded"
## [39] "protectionclass"
## [40] "constructioncd"
## [41] "constructioncd_encd"
## [42] "fire_risk_model_score"
## [43] "multipolicyind"
## [44] "multipolicyindumbrella"
## [45] "earthquakeumbrellaind"
## [46] "usagetype"
## [47] "usagetype_encd"
## [48] "ordinanceorlawpct"
## [49] "functionalreplacementcost"
## [50] "homegardcreditind"
## [51] "sprinklersystem"
## [52] "landlordind"
## [53] "rentersinsurance"
## [54] "firealarmtype"
## [55] "burglaryalarmtype"
## [56] "waterdetectiondevice"
## [57] "neighborhoodcrimewatchind"
## [58] "propertymanager"
## [59] "safeguardplusind"
## [60] "kitchenfireextinguisherind"
## [61] "gatedcommunityind"
## [62] "deadboltind"
## [63] "poolind"
## [64] "replacementcostdwellingind"
## [65] "replacementvalueind"
## [66] "serviceline"
## [67] "equipmentbreakdown"
## [68] "numberoffamilies"
## [69] "insuredage"
## [70] "maritalstatus"
## [71] "insurancescore"
## [72] "overriddeninsurancescore"
## [73] "insurancescorevalue"
## [74] "insscoretiervalueband"
## [75] "financialstabilitytier"
## [76] "allcov_wp"
## [77] "cova_wp"
## [78] "cova_ep"
## [79] "cova_deductible"
## [80] "log_cova_deductible"
## [81] "cova_limit"
## [82] "log_cova_limit"
## [83] "cova_ic_nc_water"
## [84] "hasclaim"
## [85] "cova_il_nc_water"
## [86] "log_cova_il_nc_water"
## [87] "water_risk_3_blk"
## [88] "log_water_risk_3_blk"
## [89] "water_risk_fre_3_blk"
## [90] "log_water_risk_fre_3_blk"
## [91] "water_risk_sev_3_blk"
## [92] "log_water_risk_sev_3_blk"
## [93] "appl_fail_3_blk"
## [94] "fixture_leak_3_blk"
## [95] "pipe_froze_3_blk"
## [96] "plumb_leak_3_blk"
## [97] "rep_cost_3_blk"
## [98] "ustructure_fail_3_blk"
## [99] "waterh_fail_3_blk"
## [100] "loaddate"
## [101] "customer_cnt_active_policies"
## [102] "customer_cnt_active_policies_binned"
## [103] "cova_ic_nc_water_color"
#dataset <- dataset[,all]
str(dataset)
## 'data.frame': 1990928 obs. of 103 variables:
## $ modeldata_id : int 534254 1793227 880091 1653118 353309 1289956 305141 1115709 2075469 1924235 ...
## $ systemidstart : int 1219556 7403270 2370713 6514641 733342 4348409 544662 3436984 8943949 8072132 ...
## $ systemidend : int 1219556 7403270 2955483 6514641 733342 4348409 544662 3436984 8943949 8072132 ...
## $ cal_year : int 2013 2018 2014 2018 2011 2016 2012 2015 2020 2020 ...
## $ startdate : Factor w/ 4288 levels "2009-01-01","2009-01-02",..: 1462 3611 2067 3424 1030 2878 1096 2544 4092 4018 ...
## $ enddate : Factor w/ 4619 levels "2009-01-09","2009-01-12",..: 1553 3620 2159 3431 1044 2890 1225 2524 4351 4156 ...
## $ startdatetm : Factor w/ 23690 levels "2008-01-09 00:00:00",..: 6832 20119 11522 19085 5616 15737 4639 13892 22802 21286 ...
## $ enddatetm : Factor w/ 16455 levels "2009-01-09 00:00:00",..: 5395 14950 8349 12630 3383 11613 4104 10087 16261 15864 ...
## $ ecy : num 0.339 0.115 0.342 0.11 0.129 ...
## $ log_ecy : num -1.08 -2.16 -1.07 -2.21 -2.05 ...
## $ policynumber : Factor w/ 244668 levels "AZF0082147","AZF0221975",..: 33157 192754 185786 139108 2424 15149 183739 163023 99696 200055 ...
## $ policy_uniqueid : int 445056 1541369 786905 1427784 348649 1124000 292060 976554 1776548 1654555 ...
## $ policyterm : int 4 7 4 10 1 3 1 7 4 6 ...
## $ policytype : Factor w/ 2 levels "New","Renewal": 2 2 2 2 1 2 1 2 2 2 ...
## $ effectivedate : Factor w/ 4329 levels "2008-01-09","2008-01-14",..: 1264 3654 2110 3467 1073 2921 935 2587 4135 3866 ...
## $ expirationdate : Factor w/ 4450 levels "2009-01-09","2009-01-14",..: 1385 3775 2231 3588 1195 3042 1057 2709 4256 3987 ...
## $ policystate : Factor w/ 3 levels "AZ","CA","NV": 2 2 2 2 1 1 2 2 2 2 ...
## $ policyform : Factor w/ 9 levels "DF1","DF3","DF6",..: 2 9 9 9 2 9 9 9 2 9 ...
## $ persistency : int 3 6 3 25 0 2 0 7 3 5 ...
## $ companycd : int 1 17 17 1 1 16 17 17 17 1 ...
## $ carriercd : Factor w/ 2 levels "CSEICO","CSESG": 1 2 2 1 1 1 2 2 2 1 ...
## $ agency_group : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 446 582 89 50 265 265 378 426 426 426 ...
## $ producername : Factor w/ 1269 levels "1ST CENTURY INS SVCS INC.",..: 982 1224 142 78 452 452 738 888 947 947 ...
## $ territory : Factor w/ 10 levels "","AZ-A","AZ-T",..: 8 6 6 4 6 6 8 7 7 7 ...
## $ risknumber : int 1 1 1 1 1 1 1 1 1 1 ...
## $ risktype : Factor w/ 2 levels "Dwelling","Homeowners": 1 2 2 2 1 2 2 2 1 2 ...
## $ yearbuilt : int 1986 1986 1994 1951 1974 1997 1990 1958 2005 1984 ...
## $ log_yearbuilt : num 7.59 7.59 7.6 7.58 7.59 ...
## $ sqft : int 1500 2200 2600 1100 1200 1400 1700 1100 2700 1300 ...
## $ log_sqft : num 7.33 7.73 7.87 7.04 7.12 ...
## $ stories : int 1 2 2 1 1 1 1 1 2 2 ...
## $ roofcd : Factor w/ 7 levels "COMPO","MEMBRANE",..: 6 6 6 4 1 1 6 1 6 1 ...
## $ roofcd_encd : int 7 7 7 6 8 8 7 8 7 8 ...
## $ units : int 1 1 1 1 1 1 1 1 1 1 ...
## $ occupancycd : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ occupancy_encd : int 1 1 1 1 1 1 1 1 1 1 ...
## $ allperilded : int 500 1000 2500 1000 1000 1000 1000 1000 2500 2500 ...
## $ waterded : int 0 0 0 0 0 0 0 0 0 0 ...
## $ protectionclass : int 3 2 3 3 2 2 4 3 4 4 ...
## $ constructioncd : Factor w/ 5 levels "AF","B","F","M",..: 3 1 1 3 2 5 3 3 1 1 ...
## $ constructioncd_encd : int 5 4 4 5 3 2 5 5 4 4 ...
## $ fire_risk_model_score : int 0 2 0 0 -1 -1 2 0 1 2 ...
## $ multipolicyind : int 0 0 0 1 0 0 1 0 0 0 ...
## $ multipolicyindumbrella : int 0 0 0 0 0 0 0 0 0 0 ...
## $ earthquakeumbrellaind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ usagetype : Factor w/ 7 levels "COC","PRIMARY",..: 3 2 2 2 3 2 2 2 3 2 ...
## $ usagetype_encd : int 6 7 7 7 6 7 7 7 6 7 ...
## $ ordinanceorlawpct : int 10 20 0 10 0 0 0 0 10 10 ...
## $ functionalreplacementcost : int 0 0 0 0 0 0 0 0 0 0 ...
## $ homegardcreditind : int 0 0 0 0 0 0 0 0 0 1 ...
## $ sprinklersystem : int 0 0 1 0 0 0 0 0 0 0 ...
## $ landlordind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rentersinsurance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ firealarmtype : int 0 0 1 1 1 1 1 0 1 1 ...
## $ burglaryalarmtype : int 0 0 0 0 0 0 0 0 1 0 ...
## $ waterdetectiondevice : int 0 0 0 0 0 0 0 0 0 0 ...
## $ neighborhoodcrimewatchind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ propertymanager : int 0 0 0 0 0 0 0 0 0 0 ...
## $ safeguardplusind : int 0 1 0 0 0 0 0 1 1 0 ...
## $ kitchenfireextinguisherind : int 1 1 1 0 0 0 0 1 0 0 ...
## $ gatedcommunityind : int 0 0 0 0 0 0 0 1 0 0 ...
## $ deadboltind : int 1 1 1 0 1 1 1 1 1 1 ...
## $ poolind : int 0 1 0 0 0 0 0 0 0 0 ...
## $ replacementcostdwellingind : int 0 0 0 0 1 0 0 0 1 0 ...
## $ replacementvalueind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ serviceline : int 0 1 0 0 0 0 0 0 0 0 ...
## $ equipmentbreakdown : int 0 1 0 0 0 0 0 0 0 0 ...
## $ numberoffamilies : int 1 1 1 1 1 1 1 1 1 1 ...
## $ insuredage : int 66 61 39 92 79 36 72 59 55 69 ...
## $ maritalstatus : Factor w/ 5 levels "~","Divorced",..: 1 3 3 1 4 4 4 1 3 3 ...
## $ insurancescore : Factor w/ 3842 levels "(DOES","~","610",..: 2 2 2 2 2 3809 2 2 2 2 ...
## $ overriddeninsurancescore : Factor w/ 41 levels "~","01","02",..: 1 1 1 1 1 13 1 1 1 1 ...
## $ insurancescorevalue : Factor w/ 178 levels "~","605","626",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ insscoretiervalueband : Factor w/ 24 levels "~","599-606",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ financialstabilitytier : Factor w/ 42 levels "","~","01","02",..: 1 2 1 1 1 1 1 1 2 2 ...
## $ allcov_wp : int 1029 1938 915 915 336 733 687 667 1532 712 ...
## $ cova_wp : int 1029 1342 1732 1721 282 997 973 660 1233 1645 ...
## $ cova_ep : num 349.2 154.2 592.7 188.4 36.3 ...
## $ cova_deductible : int 500 1000 2500 1000 1000 1000 1000 1000 2500 2500 ...
## $ log_cova_deductible : num 6.21 6.91 7.82 6.91 6.91 ...
## $ cova_limit : int 600000 600000 600000 400000 200000 200000 300000 300000 700000 400000 ...
## $ log_cova_limit : num 13.2 13.3 13.2 12.7 11.7 ...
## $ cova_ic_nc_water : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hasclaim : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cova_il_nc_water : num 0 0 0 0 0 0 0 0 0 0 ...
## $ log_cova_il_nc_water : num 0 0 0 0 0 0 0 0 0 0 ...
## $ water_risk_3_blk : int 230 329 213 94 234 183 329 128 171 237 ...
## $ log_water_risk_3_blk : num 5.44 5.8 5.36 4.54 5.46 ...
## $ water_risk_fre_3_blk : int 152 218 134 53 256 226 218 83 159 180 ...
## $ log_water_risk_fre_3_blk : num 5.02 5.38 4.9 3.97 5.55 ...
## $ water_risk_sev_3_blk : int 156 155 163 182 94 83 155 159 110 136 ...
## $ log_water_risk_sev_3_blk : num 5.05 5.04 5.09 5.2 4.54 ...
## $ appl_fail_3_blk : int 5 4 1 5 5 5 5 2 5 5 ...
## $ fixture_leak_3_blk : int 4 2 1 0 3 5 2 2 0 1 ...
## $ pipe_froze_3_blk : int 0 0 0 2 0 0 2 2 2 2 ...
## $ plumb_leak_3_blk : int 5 5 5 1 5 4 4 4 1 4 ...
## $ rep_cost_3_blk : int 5 5 5 5 5 4 5 5 5 5 ...
## $ ustructure_fail_3_blk : int 5 5 5 3 5 5 5 5 1 5 ...
## $ waterh_fail_3_blk : int 2 2 2 0 0 1 2 0 2 1 ...
## [list output truncated]
summary(dataset)
## modeldata_id systemidstart systemidend cal_year
## Min. : 1 Min. : 2 Min. : 3 Min. :2009
## 1st Qu.: 544975 1st Qu.:1251218 1st Qu.:1348961 1st Qu.:2013
## Median :1092584 Median :3273967 Median :3422898 Median :2016
## Mean :1089045 Mean :3880098 Mean :3980904 Mean :2015
## 3rd Qu.:1634112 3rd Qu.:6388617 3rd Qu.:6562387 3rd Qu.:2018
## Max. :2178777 Max. :9657386 Max. :9657502 Max. :2021
##
## startdate enddate startdatetm
## 2019-01-01: 113585 2019-01-01: 113556 2018-06-01 00:00:00: 1707
## 2018-01-01: 108834 2018-01-01: 108763 2017-09-01 00:00:00: 1557
## 2020-01-01: 96348 2020-01-01: 96432 2018-07-01 00:00:00: 1547
## 2017-01-01: 88488 2017-01-01: 88375 2017-07-01 00:00:00: 1508
## 2016-01-01: 76457 2016-01-01: 76443 2018-08-01 00:00:00: 1454
## 2013-01-01: 74667 2013-01-01: 74676 2018-09-01 00:00:00: 1435
## (Other) :1432549 (Other) :1432683 (Other) :1981720
## enddatetm ecy log_ecy
## 2019-06-01 00:00:00: 2013 Min. :0.0027 Min. :-5.914504
## 2018-06-01 00:00:00: 1893 1st Qu.:0.2464 1st Qu.:-1.400799
## 2019-07-01 00:00:00: 1887 Median :0.4709 Median :-0.753110
## 2019-08-01 00:00:00: 1842 Mean :0.4794 Mean :-1.020831
## 2018-07-01 00:00:00: 1829 3rd Qu.:0.7091 3rd Qu.:-0.343759
## 2018-09-01 00:00:00: 1802 Max. :1.0020 Max. : 0.001998
## (Other) :1979662
## policynumber policy_uniqueid policyterm policytype
## CAF0389924: 201 Min. : 1 Min. : 1.000 New : 474221
## CAF0461789: 192 1st Qu.: 493706 1st Qu.: 2.000 Renewal:1516707
## CAF0475516: 168 Median : 957578 Median : 3.000
## CAF0464778: 144 Mean : 944610 Mean : 3.799
## CAF0393082: 130 3rd Qu.:1407795 3rd Qu.: 5.000
## CAF0468201: 120 Max. :1854593 Max. :13.000
## (Other) :1989973
## effectivedate expirationdate policystate
## 2018-06-01: 2058 2019-06-01: 2058 AZ: 148332
## 2017-07-01: 1945 2018-07-01: 1943 CA:1775912
## 2017-09-01: 1923 2018-09-01: 1923 NV: 66684
## 2018-07-01: 1914 2019-07-01: 1913
## 2017-06-01: 1901 2018-06-01: 1904
## 2018-08-01: 1898 2019-08-01: 1892
## (Other) :1979289 (Other) :1979295
## policyform persistency companycd carriercd
## HO3 :1011501 Min. : 0.000 Min. : 1.00 CSEICO: 687722
## DF3 : 813757 1st Qu.: 1.000 1st Qu.: 1.00 CSESG :1303206
## DF6 : 84023 Median : 3.000 Median :17.00
## Form3 : 39639 Mean : 6.081 Mean :12.08
## FL1-Vacant : 22661 3rd Qu.: 8.000 3rd Qu.:17.00
## FL3-Special: 10314 Max. :103.000 Max. :19.00
## (Other) : 9033
## agency_group
## WESTERN GOLD INS AGCY INC. : 235856
## J.E. BROWN and ASSOCS INS SVCS : 87275
## PIIB - PACIFIC INTERSTATE INS : 63703
## CRUSBERG DECKER INS SVCS INC : 61736
## ISU INSURANCE SERVICES OF SAN FRANCISCO INC: 53588
## Acrisure of California : 47783
## (Other) :1440987
## producername territory
## WESTERN GOLD INS AGCY INC. : 234240 CA-B :602884
## J.E. BROWN and ASSOCS INS SVCS: 82834 CA-C :410013
## CRUSBERG DECKER INS SVCS INC : 40881 CA-O :349951
## TOMM and BUCK INSURANCE SRVCS : 23599 CA-A :301284
## BICHLMEIER INSURANCE SRVS INC : 23396 AZ-A :131032
## VALLEY WEST FINANCIAL INS SVC : 22406 CA-T :125223
## (Other) :1563572 (Other): 70541
## risknumber risktype yearbuilt log_yearbuilt
## Min. : 0.000 Dwelling : 939788 Min. :1900 Min. :7.523
## 1st Qu.: 1.000 Homeowners:1051140 1st Qu.:1959 1st Qu.:7.580
## Median : 1.000 Median :1979 Median :7.590
## Mean : 1.006 Mean :1976 Mean :7.589
## 3rd Qu.: 1.000 3rd Qu.:1996 3rd Qu.:7.599
## Max. :16.000 Max. :2019 Max. :7.611
##
## sqft log_sqft stories roofcd
## Min. : 800 Min. :6.397 Min. :1.000 COMPO :997908
## 1st Qu.:1300 1st Qu.:7.177 1st Qu.:1.000 MEMBRANE: 23403
## Median :1700 Median :7.438 Median :1.000 METAL : 5135
## Mean :1873 Mean :7.470 Mean :1.208 OTHER :234545
## 3rd Qu.:2300 3rd Qu.:7.749 3rd Qu.:1.000 TAR : 45071
## Max. :5000 Max. :9.210 Max. :3.000 TILE :666814
## WOOD : 18052
## roofcd_encd units occupancycd occupancy_encd
## Min. :1.00 Min. :1.000 NO : 5 Min. :1.000
## 1st Qu.:7.00 1st Qu.:1.000 OCCUPIEDNOW:1856970 1st Qu.:1.000
## Median :8.00 Median :1.000 TENANT : 133953 Median :1.000
## Mean :7.23 Mean :1.125 Mean :1.067
## 3rd Qu.:8.00 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :8.00 Max. :4.000 Max. :3.000
##
## allperilded waterded protectionclass constructioncd
## Min. : 0 Min. : 0.00 Min. : 0.000 AF : 801773
## 1st Qu.: 1000 1st Qu.: 0.00 1st Qu.: 2.000 B : 22461
## Median : 1000 Median : 0.00 Median : 3.000 F :1121154
## Mean : 1403 Mean : 94.63 Mean : 2.947 M : 22588
## 3rd Qu.: 2500 3rd Qu.: 0.00 3rd Qu.: 4.000 OTHER: 22952
## Max. :10000 Max. :10000.00 Max. :10.000
##
## constructioncd_encd fire_risk_model_score multipolicyind
## Min. :1.000 Min. :-1.0000 Min. :0.0000
## 1st Qu.:4.000 1st Qu.: 0.0000 1st Qu.:0.0000
## Median :5.000 Median : 0.0000 Median :0.0000
## Mean :4.495 Mean : 0.2137 Mean :0.1617
## 3rd Qu.:5.000 3rd Qu.: 0.0000 3rd Qu.:0.0000
## Max. :5.000 Max. :18.0000 Max. :1.0000
##
## multipolicyindumbrella earthquakeumbrellaind usagetype
## Min. :0.000000 Min. :0.000000 COC : 11503
## 1st Qu.:0.000000 1st Qu.:0.000000 PRIMARY :1107959
## Median :0.000000 Median :0.000000 RENTAL : 848301
## Mean :0.005191 Mean :0.004436 SEASONAL : 8397
## 3rd Qu.:0.000000 3rd Qu.:0.000000 SECONDARY : 3601
## Max. :1.000000 Max. :1.000000 UNOCCUPIED: 321
## VACANT : 10846
## usagetype_encd ordinanceorlawpct functionalreplacementcost
## Min. :1.000 Min. : 0.000 Min. :0.000000
## 1st Qu.:6.000 1st Qu.: 0.000 1st Qu.:0.000000
## Median :7.000 Median : 10.000 Median :0.000000
## Mean :6.519 Mean : 9.887 Mean :0.001915
## 3rd Qu.:7.000 3rd Qu.: 10.000 3rd Qu.:0.000000
## Max. :7.000 Max. :100.000 Max. :1.000000
##
## homegardcreditind sprinklersystem landlordind rentersinsurance
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.000000
## Mean :0.1413 Mean :0.03199 Mean :0.08028 Mean :0.005884
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.000000
##
## firealarmtype burglaryalarmtype waterdetectiondevice
## Min. :0.000 Min. :0.0000 Min. :0.0000000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000000
## Median :1.000 Median :0.0000 Median :0.0000000
## Mean :0.627 Mean :0.3655 Mean :0.0002401
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:0.0000000
## Max. :1.000 Max. :1.0000 Max. :1.0000000
##
## neighborhoodcrimewatchind propertymanager safeguardplusind
## Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.01387 Mean :0.02075 Mean :0.3691
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## kitchenfireextinguisherind gatedcommunityind deadboltind
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :1.0000
## Mean :0.3979 Mean :0.01435 Mean :0.7198
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## poolind replacementcostdwellingind replacementvalueind
## Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.03559 Mean :0.3573 Mean :0.0168
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## serviceline equipmentbreakdown numberoffamilies insuredage
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.: 46.00
## Median :0.0000 Median :0.0000 Median :1.000 Median : 57.00
## Mean :0.1171 Mean :0.1198 Mean :1.126 Mean : 56.41
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.: 66.00
## Max. :1.0000 Max. :1.0000 Max. :4.000 Max. :177.00
## NA's :236483
## maritalstatus insurancescore overriddeninsurancescore
## ~ :794672 ~ :1902377 ~ :1846216
## Divorced: 8151 99 : 418 99 : 49428
## Married :565124 KQXKD : 155 07 : 9664
## Single :603723 ZKKWS : 119 12 : 8023
## Widowed : 19258 KFWZV : 108 04 : 6305
## XXQKS : 101 06 : 5842
## (Other): 87650 (Other): 65450
## insurancescorevalue insscoretiervalueband financialstabilitytier
## ~ :1990011 ~ :1990011 :1740132
## 825 : 36 837-865: 107 ~ : 234605
## 873 : 20 865-880: 94 07 : 1577
## 769 : 18 748-774: 90 12 : 1337
## 881 : 15 820-837: 81 04 : 1030
## 829 : 14 894-945: 73 13 : 970
## (Other): 814 (Other): 472 (Other): 11277
## allcov_wp cova_wp cova_ep cova_deductible
## Min. : 32 Min. : 5.0 Min. :-8666.2 Min. : 0
## 1st Qu.: 548 1st Qu.: 468.0 1st Qu.: 116.4 1st Qu.: 1000
## Median : 777 Median : 755.0 Median : 283.4 Median : 1000
## Mean : 911 Mean : 973.5 Mean : 411.2 Mean : 1403
## 3rd Qu.: 1105 3rd Qu.: 1231.0 3rd Qu.: 567.6 3rd Qu.: 2500
## Max. :18926 Max. :14620.0 Max. :12121.9 Max. :10000
##
## log_cova_deductible cova_limit log_cova_limit cova_ic_nc_water
## Min. :0.000 Min. : 100000 Min. : 8.294 Min. :0.000000
## 1st Qu.:6.908 1st Qu.: 300000 1st Qu.:12.384 1st Qu.:0.000000
## Median :6.908 Median : 400000 Median :12.702 Median :0.000000
## Mean :7.018 Mean : 422080 Mean :12.689 Mean :0.005978
## 3rd Qu.:7.824 3rd Qu.: 500000 3rd Qu.:13.035 3rd Qu.:0.000000
## Max. :9.210 Max. :1300000 Max. :14.944 Max. :3.000000
##
## hasclaim cova_il_nc_water log_cova_il_nc_water
## Min. :0.000000 Min. : -5536.9 Min. :-0.10536
## 1st Qu.:0.000000 1st Qu.: 0.0 1st Qu.: 0.00000
## Median :0.000000 Median : 0.0 Median : 0.00000
## Mean :0.005844 Mean : 79.6 Mean : 0.05128
## 3rd Qu.:0.000000 3rd Qu.: 0.0 3rd Qu.: 0.00000
## Max. :1.000000 Max. :522735.2 Max. :13.16683
## NA's :3
## water_risk_3_blk log_water_risk_3_blk water_risk_fre_3_blk
## Min. : 15.0 Min. :2.708 Min. : 15.0
## 1st Qu.: 134.0 1st Qu.:4.898 1st Qu.: 108.0
## Median : 185.0 Median :5.220 Median : 154.0
## Mean : 202.6 Mean :5.202 Mean : 169.4
## 3rd Qu.: 244.0 3rd Qu.:5.497 3rd Qu.: 209.0
## Max. :1491.0 Max. :7.307 Max. :2308.0
##
## log_water_risk_fre_3_blk water_risk_sev_3_blk log_water_risk_sev_3_blk
## Min. :2.708 Min. : 33.0 Min. :3.497
## 1st Qu.:4.682 1st Qu.:106.0 1st Qu.:4.663
## Median :5.037 Median :125.0 Median :4.828
## Mean :5.014 Mean :127.7 Mean :4.821
## 3rd Qu.:5.342 3rd Qu.:147.0 3rd Qu.:4.990
## Max. :7.744 Max. :313.0 Max. :5.746
##
## appl_fail_3_blk fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.00
## 1st Qu.:4.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.00
## Median :5.000 Median :2.000 Median :2.000 Median :4.00
## Mean :4.118 Mean :1.884 Mean :1.735 Mean :3.07
## 3rd Qu.:5.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:4.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
##
## rep_cost_3_blk ustructure_fail_3_blk waterh_fail_3_blk
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:5.000 1st Qu.:5.000 1st Qu.:0.000
## Median :5.000 Median :5.000 Median :1.000
## Mean :4.797 Mean :4.441 Mean :1.154
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:2.000
## Max. :5.000 Max. :5.000 Max. :5.000
##
## loaddate customer_cnt_active_policies
## 2020-08-07 08:09:37.497:1990928 Min. : 1.000
## 1st Qu.: 1.000
## Median : 1.000
## Mean : 1.757
## 3rd Qu.: 1.000
## Max. :147.000
##
## customer_cnt_active_policies_binned cova_ic_nc_water_color
## Min. : 1.000 Min. :0.000000
## 1st Qu.: 1.000 1st Qu.:0.000000
## Median : 1.000 Median :0.000000
## Mean : 2.936 Mean :0.005978
## 3rd Qu.: 1.000 3rd Qu.:0.000000
## Max. :150.000 Max. :3.000000
##
dataset$cova_ic_nc_water_color <- as.factor(dataset$cova_ic_nc_water)
ggplot(dataset, aes(x = cova_ic_nc_water, fill=cova_ic_nc_water_color)) +
geom_bar() +
labs(x = 'Number of Claims', y = 'Count', title = 'Histogram of Number of Claims') +
scale_fill_manual("legend", values = c("0" = "#56B4E9", "1" = "red", "2" = "red", "3" = "red")) +
geom_text(stat='count', aes(label=..count..), vjust=1)
Since the cases when there are more then 1 claim per exposure are very rare and, I create a new logical attribute HasClaim with values 1 or 0. It can be used in logistic regression but I use it visualize if there is any visual dependency between predictors and claims.
dataset$hasclaim <- as.factor(dataset$hasclaim)
ggplot(dataset, aes(x = hasclaim, fill=hasclaim)) +
geom_bar() +
scale_fill_manual("legend", values = c("0" = "#56B4E9", "1" = "red")) +
labs(x = 'Adjusted Number of Claims', y = 'Count', title = 'Adjusted Histogram of Number of Claims') +
geom_text(stat='count', aes(label=..count..), vjust=1)
There is cery low percent of water related claims: 0.6%
(the order is from XGB Classification feature importance)
ratio <- nrow(dataset)
#grid for multiplots
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
require(grid)
plots <- c(list(...), plotlist)
numPlots = length(plots)
if (is.null(layout)) {
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots == 1) {
print(plots[[1]])
} else {
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
#continuous attributes
plot_continuous <- function (col_name) {
p1 <- ggplot(dataset, aes(x = .data[[col_name]], fill=hasclaim)) +
geom_histogram(bins=100) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
labs(x = col_name, y = 'Count', title = paste("Histogram of", col_name))
#p2 <- ggplot(dataset, aes(x = .data[[col_name]])) +
#geom_density() +
# labs(x = col_name, y = 'Density', title = paste("Density of", col_name))
pd_col <- pd_dataset[pd_dataset$feature == col_name,][c('value','pd')]
p2 <- ggplot(pd_col, aes(x = value, y = pd)) +
geom_line(aes(color="darkred")) +
labs(x = col_name, y = 'pd', title = paste("Claim Partial Dependency of", col_name))
p3 <- ggplot(dataset, aes(x=hasclaim, y=.data[[col_name]], col=hasclaim, fill=hasclaim)) +
geom_boxplot(notch = TRUE) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
theme(legend.position = "none") +
labs(y = col_name, title = paste("Box Plot of", col_name, "with hasclaim"))
vec <- dataset[[col_name]]
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
p4 <- ggplot(dataset, aes(sample = .data[[col_name]], col='red')) +
stat_qq() +
geom_abline(slope = slope, intercept = int) +
theme(legend.position = "none") +
labs(y = col_name, title = paste("QQ Plot of", col_name))
multiplot(p1,p2,p3,p4, cols=2)
}
#categorical attributes - multiplot does not work for fun modeling
plot_categorical <- function(col_name) {
if ( col_name == 'roofcd_encd') {
col_name_original <- 'roofcd'
}
else if ( col_name == 'usagetype_encd' ) {
col_name_original <- 'usagetype'
}
else if ( col_name == 'constructioncd_encd' ) {
col_name_original <- 'constructioncd'
}
else if ( col_name == 'occupancy_encd' ) {
col_name_original <- 'occupancycd'
}
else {
col_name_original <- col_name
}
pd_col <- pd_dataset[pd_dataset$feature == col_name,][c('value','pd')]
colnames(pd_col) <- c(col_name,'pd')
df <- merge(dataset[c(col_name,'hasclaim')],pd_col,by=col_name)
if (grepl('encd',col_name) |
col_name =='fire_risk_model_score' |
col_name =='customer_cnt_active_policies_binned' |
col_name =='cova_deductible' |
col_name =='cova_limit' |
col_name =='protectionclass' |
col_name =='ordinanceorlawpct' |
col_name =='numberoffamilies' |
col_name =='waterded' |
col_name =='units' |
col_name =='stories') {
XBreaks <- as.vector(unlist(unique(dataset[c(col_name)])))
XLabels <- as.vector(unlist(unique(dataset[c(col_name_original)])))
}
else if (col_name =='pipe_froze_3_blk' |
col_name =='water_risk_3_blk' |
col_name =='ustructure_fail_3_blk' |
col_name =='water_risk_fre_3_blk' |
col_name =='waterh_fail_3_blk' |
col_name =='rep_cost_3_blk' |
col_name =='plumb_leak_3_blk' |
col_name =='appl_fail_3_blk' |
col_name =='fixture_leak_3_blk') {
XBreaks <- c(0,1,2,3,4,5)
XLabels <- c('Low', 'Elevated', 'Below Avg', 'Average', 'High', 'Highest')
}
else {
XBreaks <- c(0,1)
XLabels <- c("No", "Yes")
}
cols <- c('PD'='#f04546')
p1 = ggplot(df) +
geom_bar(aes(x = .data[[col_name]], fill=hasclaim)) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
geom_line(aes(x = .data[[col_name]], y=(pd)*ratio*100,color='PD')) +
scale_y_continuous(sec.axis = sec_axis(~./ratio, name = "PD")) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
labs(x = col_name, y = 'Count', title = paste("Histogram and claims partial dependency of", col_name_original)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
if (length(unique(dataset[[col_name_original]])) <= 6)
{p2 = bayesian_plot(data=dataset, input=col_name_original, target='hasclaim')}
p3 = cross_plot(data=dataset, str_input=col_name_original, str_target='hasclaim',plot_type='percentual')
if (length(unique(dataset[[col_name_original]])) <= 6) {
print(p1)
print(p2)
print(p3)
}
else {
print(p1)
print(p3)
}
}
plot_categorical('usagetype_encd')
## NULL
The more property is used, the higher claims rate.
plot_categorical('customer_cnt_active_policies_binned')
## NULL
If the same customer has more then 10 active policies, the claim rate is lower.
plot_continuous('ecy')
## Loading required package: grid
The longer the exposure, the higher claim rate.
plot_categorical('cova_deductible')
## NULL
The claim rate is higher in low deductible policies.
plot_continuous('yearbuilt')
More claims are in newer houses but not in modern.
plot_categorical('landlordind')
## NULL
This is a discount based on the number of policies for the same customer. It’s correlated with customer_cnt_active_policies_binned and has the same claim dependency but without details: more policies less claims rate.
plot_categorical('pipe_froze_3_blk')
## NULL
Everything indicates we have more claims in the lowest category. It may makes sense for california, where we have most insured properties in an area where low tempretures are rare but not useful for further analysis.
plot_categorical('roofcd_encd')
## NULL
Visible higher claim rate in WOOD, TILE, and maybe, TAR and OTHER
plot_continuous('sqft')
The higher sqft, the higher claim rate till some limit, where it is not increased.
plot_categorical('firealarmtype')
## NULL
Firealarmtype is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. Adding the predictor to a model increase overfitting.
plot_categorical('stories')
## NULL
There is a clear evidence, the 2-stories properties have more claims. However, the attributes is broken in the database. There are a lot of empty or high values (known issues).
plot_categorical('equipmentbreakdown')
## NULL
Equipmentbreakdown is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. Adding the predictor to a model increase overfitting.
plot_categorical('cova_limit')
## NULL
More claims from more expensive properties.
plot_categorical('replacementvalueind')
## NULL
Replacementvalueind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes.
plot_categorical('propertymanager')
## NULL
Even if PropertyManager decrease number of claims according to visual analysis it is not clear from the partial dependency. Probably due to correlations.
plot_categorical('multipolicyind')
## NULL
On the one hand, there are more claims in “Yes” multipolicyind category, on the other, it’s different in the partial dependency. The predictor is not very significant in GLM
plot_categorical('poolind')
## NULL
More claims in properties with pools according to the charts and baysian comparizon but it’s different in the partial dependency. GLM results are more close to the visual and significant. The difference between “No” and “Yes” categories is very small and can be due to errors.
plot_categorical('replacementcostdwellingind')
## NULL
Replacementcostdwellingind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. There is almost no difference between categories in the partial dependency.
plot_categorical('safeguardplusind')
## NULL
Safeguardplusind is not directly related to water claims. There are some evidence there is a dependency but, probably, due to correlations with other attributes. There is almost no difference between categories in the partial dependency and charts.
plot_categorical('ustructure_fail_3_blk')
## NULL
There are slight increase in the “Below Average” category partial dependency but not in visual.
plot_continuous('water_risk_3_blk')
The higher the score, the more claims according to box-plots and partial dependency.
The rest of the predictors are not very important or directly related to water claims.
plot_categorical('serviceline')
## NULL
plot_categorical('protectionclass')
## NULL
plot_continuous('water_risk_fre_3_blk')
plot_categorical('rep_cost_3_blk')
## NULL
plot_categorical('waterh_fail_3_blk')
## NULL
plot_categorical('deadboltind')
## NULL
plot_categorical('homegardcreditind')
## NULL
plot_categorical('ordinanceorlawpct')
## NULL
plot_categorical('occupancy_encd')
## NULL
plot_categorical('burglaryalarmtype')
## NULL
plot_categorical('waterded')
## NULL
plot_categorical('plumb_leak_3_blk')
## NULL
plot_categorical('appl_fail_3_blk')
## NULL
plot_categorical('numberoffamilies')
## NULL
plot_categorical('units')
## NULL
plot_continuous('water_risk_sev_3_blk')
plot_categorical('multipolicyindumbrella')
## NULL
plot_categorical('kitchenfireextinguisherind')
## NULL
plot_categorical('fixture_leak_3_blk')
## NULL
plot_categorical('constructioncd_encd')
## NULL
plot_categorical('fire_risk_model_score')
## NULL
plot_categorical('gatedcommunityind')
## NULL
plot_categorical('sprinklersystem')
## NULL
plot_categorical('neighborhoodcrimewatchind')
## NULL
plot_categorical('rentersinsurance')
## NULL
plot_categorical('earthquakeumbrellaind')
## NULL
plot_categorical('functionalreplacementcost')
## NULL
ggplot(data=dataset, mapping = aes(x=yearbuilt, y=sqft, color=hasclaim)) +
geom_point() +
scale_color_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
labs(title = "Sqft vs YearBuilt and HasClaim")
We have more policies between 1945 and 2010 with more claims in this period.
ggplot(dataset, aes(x=roofcd, y=yearbuilt, col=hasclaim, fill=hasclaim)) +
geom_boxplot(notch = TRUE) +
#scale_color_manual(breaks = c("0","1"),
# values=c("#56B4E9","red")) +
scale_fill_manual(breaks = c("0","1"),
values=c("#56B4E9","red")) +
theme(legend.position = "none") +
labs(x = "Roof Code", title = paste("Box Plot of YearBuilt vs Roof Code"))
Tile and wood is used in more modern houses with lower rate of claims. TAR is used in older. COMPO, TAR and OTHER have more claims then other