forked from gcrispin57/ditech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_format.py
141 lines (112 loc) · 4.67 KB
/
feature_format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/python
"""
A general tool for converting data from the
dictionary format to an (n x k) python list that's
ready for training an sklearn algorithm
n--no. of key-value pairs in dictonary
k--no. of features being extracted
dictionary keys are names of persons in dataset
dictionary values are dictionaries, where each
key-value pair in the dict is the name
of a feature, and its value for that person
In addition to converting a dictionary to a numpy
array, you may want to separate the labels from the
features--this is what targetFeatureSplit is for
so, if you want to have the poi label as the target,
and the features you want to use are the person's
salary and bonus, here's what you would do:
feature_list = ["poi", "salary", "bonus"]
data_array = featureFormat( data_dictionary, feature_list )
label, features = targetFeatureSplit(data_array)
the line above (targetFeatureSplit) assumes that the
label is the _first_ item in feature_list--very important
that poi is listed first!
"""
import numpy as np
def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
""" convert dictionary to numpy array of features
remove_NaN = True will convert "NaN" string to 0.0
remove_all_zeroes = True will omit any data points for which
all the features you seek are 0.0
remove_any_zeroes = True will omit any data points for which
any of the features you seek are 0.0
sort_keys = True sorts keys by alphabetical order. Setting the value as
a string opens the corresponding pickle file with a preset key
order (this is used for Python 3 compatibility, and sort_keys
should be left as False for the course mini-projects).
NOTE: first feature is assumed to be 'poi' and is not checked for
removal for zero or missing values.
"""
bypassed_cnt = 0
return_list = []
print "featureFormat..."
# Key order - first branch is for Python 3 compatibility on mini-projects,
# second branch is for compatibility on final project.
if isinstance(sort_keys, str):
import pickle
keys = pickle.load(open(sort_keys, "rb"))
elif sort_keys:
keys = sorted(dictionary.keys())
else:
keys = dictionary.keys()
for key in keys:
#bypass if label value = None
if dictionary[key][features[0]] == None:
continue
tmp_list = []
for feature in features:
try:
dictionary[key][feature]
except KeyError:
print "error: key ", feature, " not present"
return
# bypass record if any feature value is None
value = dictionary[key][feature]
if value is None:
value = -1
if value=="NaN" and remove_NaN:
value = 0
tmp_list.append( float(value) )
# Logic for deciding whether or not to add the data point.
append = True
### if all features are zero and you want to remove
### data points that are all zero, do that here
if remove_all_zeroes:
append = False
for item in tmp_list:
if item != 0 and item != "NaN":
append = True
break
### if any features for a given data point are zero
### and you want to remove data points with any zeroes,
### handle that here
if remove_any_zeroes:
if 0 in tmp_list or "NaN" in tmp_list:
append = False
# do not append data point if there were any None values
# None values are represented by -1
if -1 in tmp_list:
bypassed_cnt +=1
append = False
### Append the data point if flagged for addition.
if append:
return_list.append( np.array(tmp_list) )
print "Number of records bypassed %d" % bypassed_cnt
return np.array(return_list)
def targetFeatureSplit( data ):
"""
given a numpy array like the one returned from
featureFormat, separate out the first feature
and put it into its own list (this should be the
quantity you want to predict)
return targets and features as separate lists
(sklearn can generally handle both lists and numpy arrays as
input formats when training/predicting)
"""
print "targetFeatureSplit..."
target = []
features = []
for item in data:
target.append( item[0] )
features.append( item[1:] )
return target, features