In [1]:
import os

import numpy as np
import pandas as pd
In [2]:
def resolve_data_folder(default, env_var='PROPERTY_DATA_DIR'):
    """Return the data directory as a string ending with a path separator.

    Parameters
    ----------
    default : str
        Directory used when the environment variable is unset or empty.
    env_var : str
        Name of the environment variable that may override ``default``.

    Returns
    -------
    str
        The chosen directory, always terminated by a path separator, because
        this notebook builds file paths by concatenating ``data_folder`` with
        a file name.
    """
    chosen = os.environ.get(env_var) or default
    # os.path.join(path, '') appends a separator only when one is missing.
    return os.path.join(chosen, '')


# Default stays the original location; set PROPERTY_DATA_DIR to run elsewhere.
data_folder = resolve_data_folder('/home/kate/Research/Property/Data/')
In [3]:
# Load the raw claims extract from the data folder.
claims_csv_path = data_folder + 'property_water_claims_non_cat_fs_v5.csv'
try:
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` reproduces the old behaviour of
    # `error_bad_lines=False` (with the default `warn_bad_lines=True`):
    # malformed rows are reported and skipped instead of aborting the load.
    dataset = pd.read_csv(claims_csv_path, on_bad_lines='warn', index_col=False)
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword there.
    dataset = pd.read_csv(claims_csv_path, error_bad_lines=False, index_col=False)
In [4]:
# Step 1: keep rows whose cova_il_nc_water value is at least 100.
at_least_minimum = dataset['cova_il_nc_water'] >= 100
dataset = dataset.loc[at_least_minimum].copy()

# Step 2: drop the upper tail. The 0.95 quantile is taken on the rows that
# survived step 1, and rows must be strictly below it.
upper_cutoff = dataset['cova_il_nc_water'].quantile(0.95)
below_cutoff = dataset['cova_il_nc_water'] < upper_cutoff
dataset = dataset.loc[below_cutoff].copy()
In [5]:
# Feature columns carried into the feature-importance EDA extract.
# The order of this list fixes the column order of that extract.
featureset = [
    'stories', 'units', 'multipolicyind', 'functionalreplacementcost',
    'landlordind', 'burglaryalarmtype', 'propertymanager', 'gatedcommunityind',
    'replacementcostdwellingind', 'equipmentbreakdown', 'cova_deductible',
    'water_risk_sev_3_blk', 'fixture_leak_3_blk', 'rep_cost_3_blk', 'sqft',
    'waterded', 'constructioncd_encd', 'multipolicyindumbrella',
    'usagetype_encd', 'homegardcreditind', 'rentersinsurance',
    'waterdetectiondevice', 'safeguardplusind', 'deadboltind',
    'replacementvalueind', 'numberoffamilies', 'water_risk_fre_3_blk',
    'pipe_froze_3_blk', 'ustructure_fail_3_blk',
    'customer_cnt_active_policies_binned', 'ecy', 'yearbuilt', 'roofcd_encd',
    'occupancy_encd', 'protectionclass', 'fire_risk_model_score',
    'earthquakeumbrellaind', 'ordinanceorlawpct', 'sprinklersystem',
    'firealarmtype', 'neighborhoodcrimewatchind', 'kitchenfireextinguisherind',
    'poolind', 'serviceline', 'cova_limit', 'water_risk_3_blk',
    'appl_fail_3_blk', 'plumb_leak_3_blk', 'waterh_fail_3_blk',
]
In [6]:
# Column used as the target; it is appended to the feature list when the
# EDA extract is built.
target_column = 'cova_il_nc_water'
In [7]:
# Row count of `dataset` as it stands when this cell runs.
row_count = len(dataset)
print('Original Size = %s' % row_count)
Original Size = 13384
In [8]:
# Columns kept for the EDA extract: every feature followed by the target.
eda_columns = featureset + [target_column]

# Drop rows that repeat the same feature/target combination (first occurrence
# is kept), then restrict the frame to those columns.
deduplicated_rows = dataset.drop_duplicates(subset=eda_columns)
EDA_FI_dataset = deduplicated_rows[eda_columns]
In [9]:
# Preview the first five rows of the de-duplicated extract.
EDA_FI_dataset.head(n=5)
Out[9]:
stories units multipolicyind functionalreplacementcost landlordind burglaryalarmtype propertymanager gatedcommunityind replacementcostdwellingind equipmentbreakdown ... neighborhoodcrimewatchind kitchenfireextinguisherind poolind serviceline cova_limit water_risk_3_blk appl_fail_3_blk plumb_leak_3_blk waterh_fail_3_blk cova_il_nc_water
42 1 1 0 0 0 0 0 0 1 0 ... 0 1 0 0 300000 270 5 5 1 6298.46
316 1 1 0 0 0 1 0 0 1 0 ... 0 1 0 0 200000 238 5 5 4 5456.67
330 1 1 0 0 0 0 0 0 1 0 ... 0 0 0 0 200000 164 5 4 4 14917.51
594 1 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 900000 76 3 1 0 1139.50
650 1 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 300000 121 5 1 0 1863.13

5 rows × 50 columns

In [10]:
# Row count of the de-duplicated extract.
unique_row_count = len(EDA_FI_dataset)
print('Size of unique values for EDA of Feature Importance = %s' % unique_row_count)
Size of unique values for EDA of Feature Importance = 10957
In [11]:
# Write the extract next to the input data, with a header row and without
# the DataFrame index.
eda_output_path = '%sEDA_Severity_FI_dataset.csv' % data_folder
EDA_FI_dataset.to_csv(eda_output_path, index=False, header=True)