In [5]:
import os

import numpy as np
import pandas as pd
In [6]:
# Directory holding the input CSV and receiving the exported CSV.
# Defaults to the original location, but can be redirected without editing the
# notebook by setting the PROPERTY_DATA_FOLDER environment variable.
# os.path.join(..., '') guarantees a trailing path separator, because later
# cells build file paths by plain string concatenation with this value.
data_folder = os.path.join(
    os.environ.get('PROPERTY_DATA_FOLDER', '/home/kate/Research/Property/Data/'),
    '',
)
In [7]:
# Load the raw claims extract from `data_folder`.
# Malformed rows are dropped with a warning instead of aborting the load:
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0 (passing it there
# raises TypeError). Requires pandas >= 1.3.
# `index_col=False` stops pandas from using the first column as the index.
dataset = pd.read_csv(
    data_folder + 'property_water_claims_non_cat_fs_v5.csv',
    on_bad_lines='warn',
    index_col=False,
)
In [8]:
# Ordered list of the feature column names pulled from `dataset`.
# The order matters: it becomes the column order of the frame that is
# de-duplicated and exported further down (the target column is appended last).
featureset = [
    "stories",
    "units",
    "multipolicyind",
    "functionalreplacementcost",
    "landlordind",
    "burglaryalarmtype",
    "propertymanager",
    "gatedcommunityind",
    "replacementcostdwellingind",
    "equipmentbreakdown",
    "cova_deductible",
    "water_risk_sev_3_blk",
    "fixture_leak_3_blk",
    "rep_cost_3_blk",
    "sqft",
    "waterded",
    "constructioncd_encd",
    "multipolicyindumbrella",
    "usagetype_encd",
    "homegardcreditind",
    "rentersinsurance",
    "waterdetectiondevice",
    "safeguardplusind",
    "deadboltind",
    "replacementvalueind",
    "numberoffamilies",
    "water_risk_fre_3_blk",
    "pipe_froze_3_blk",
    "ustructure_fail_3_blk",
    "customer_cnt_active_policies_binned",
    "ecy",
    "yearbuilt",
    "roofcd_encd",
    "occupancy_encd",
    "protectionclass",
    "fire_risk_model_score",
    "earthquakeumbrellaind",
    "ordinanceorlawpct",
    "sprinklersystem",
    "firealarmtype",
    "neighborhoodcrimewatchind",
    "kitchenfireextinguisherind",
    "poolind",
    "serviceline",
    "cova_limit",
    "water_risk_3_blk",
    "appl_fail_3_blk",
    "plumb_leak_3_blk",
    "waterh_fail_3_blk",
]
In [9]:
# Name of the label column; it is appended to `featureset` when the
# de-duplicated frame is built.
target_column = "hasclaim"
In [10]:
# Report the row count of the raw frame, before any de-duplication.
print('Original Size = %s' % (len(dataset),))
Original Size = 2306865
In [11]:
# Build the frame used for the feature-importance EDA: the feature columns plus
# the target, with exact duplicate rows removed (first occurrence kept).
# The columns are projected BEFORE de-duplicating so that only the needed
# columns are copied. The surviving rows, their index labels and the column
# order are the same as de-duplicating the full frame on this subset and
# selecting the columns afterwards.
EDA_FI_dataset = dataset[featureset + [target_column]].drop_duplicates()
In [12]:
# Preview the first five rows of the de-duplicated frame as the cell output.
EDA_FI_dataset.head(n=5)
Out[12]:
stories units multipolicyind functionalreplacementcost landlordind burglaryalarmtype propertymanager gatedcommunityind replacementcostdwellingind equipmentbreakdown ... neighborhoodcrimewatchind kitchenfireextinguisherind poolind serviceline cova_limit water_risk_3_blk appl_fail_3_blk plumb_leak_3_blk waterh_fail_3_blk hasclaim
0 1 1 0 0 0 0 0 0 1 0 ... 0 0 0 0 200000 168 5 4 2 0
1 1 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 200000 201 5 5 3 0
2 1 1 0 0 0 0 0 0 1 0 ... 0 0 0 0 200000 222 5 1 3 0
3 1 1 0 0 0 0 0 0 1 0 ... 0 0 0 0 200000 233 5 4 3 0
4 1 1 0 0 0 0 0 0 1 0 ... 0 1 0 0 200000 266 5 4 3 0

5 rows × 50 columns

In [13]:
# Report how many rows remain after de-duplicating on features + target.
print(
    'Size of unique values for EDA of Feature Importance = %s'
    % (len(EDA_FI_dataset),)
)
Size of unique values for EDA of Feature Importance = 1191707
In [14]:
# Write the de-duplicated frame into `data_folder` as CSV, with a header row
# and without the DataFrame index column.
EDA_FI_dataset.to_csv(
    '%sEDA_FI_dataset.csv' % (data_folder,),
    header=True,
    index=False,
)