preprocess.py (forked from Fematich/mlengine-boilerplate)
#!/usr/bin/python
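"""Preprocess raw CSV data into TFRecord files with an Apache Beam pipeline.

Reads CSV rows, converts each row into a feature dictionary, partitions the
examples into train/validation/test sets, and writes them as gzipped
TFRecord files.
"""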
import argparse
import logging
import os
import sys
from datetime import datetime
import apache_beam as beam
from apache_beam.io import filesystem
from apache_beam.io import tfrecordio
from apache_beam.metrics import Metrics
from tensorflow_transform import coders
from trainer.config import BUCKET, DATA_DIR, PROJECT_ID, TFRECORD_DIR
from trainer.util import schema

# Beam metrics that count how many examples land in each partition and how
# many raw rows fail to parse.
partition_train = Metrics.counter('partition', 'train')
partition_validation = Metrics.counter('partition', 'validation')
partition_test = Metrics.counter('partition', 'test')
examples_failed = Metrics.counter('build', 'failed')


def build_example(raw_in):
"""Build a dictionary that contains all the features and label to store
as TFRecord
Args:
raw_in: raw data to build the example from
Returns:
dict: A dictionary of features
"""
try:
elements = raw_in.split(',')
key = elements[0]
label = float(elements[1])
feat = [float(el) for el in elements[2:]]
features = {
'id': key,
'label': label,
'feat': feat,
}
yield features
except Exception as e:
examples_failed.inc()
logging.error(e, exc_info=True)
pass
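# For illustration (hypothetical row): build_example('abc,1.0,0.5,0.3') would
# yield {'id': 'abc', 'label': 1.0, 'feat': [0.5, 0.3]}.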


def partition_fn(example, num_partitions):
    """Partition examples deterministically by hashing their id.

    Args:
        example (dict): a dictionary containing at least the key 'id'
        num_partitions (int): number of partitions; required by
            beam.Partition but unused here

    Returns:
        int: index of the partition the example is assigned to
            (based on a hash of its id)
    """
    # Split examples roughly 80% train / 10% validation / 10% test, based on
    # a hash of the example id.
    distribution = [80, 10, 10]
    bucket = hash(str(example['id'])) % sum(distribution)
if bucket < distribution[0]:
partition_train.inc()
return 0
elif bucket < distribution[0] + distribution[1]:
partition_validation.inc()
return 1
else:
partition_test.inc()
return 2
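# Note: in Python 3, hash() of a string is randomized per interpreter process
# unless PYTHONHASHSEED is set, so this split is only stable across runs when
# hash randomization is disabled (it is deterministic on Python 2).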


def parse_arguments(argv):
    """Parse command-line arguments.

    Args:
        argv (list): list of command-line arguments, including the program name

    Returns:
        argparse.Namespace: the parsed (known) arguments
    """
parser = argparse.ArgumentParser(description='Runs Preprocessing.')
parser.add_argument('--project_id',
default=PROJECT_ID,
help='The project to which the job will be submitted.')
parser.add_argument('--cloud',
action='store_true',
help='Run preprocessing on the cloud.')
parser.add_argument('--output_dir',
default=BUCKET,
help=('Google Cloud Storage or Local directory in '
'which to place outputs.'))
args, _ = parser.parse_known_args(args=argv[1:])
return args
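# Example: parse_arguments(['preprocess.py', '--cloud']) returns a namespace
# with cloud=True and project_id/output_dir left at their defaults.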


def get_cloud_pipeline_options(project, output_dir):
    """Get Apache Beam pipeline options for running the job on Cloud Dataflow.

    Args:
        project (str): GCP project to which the job will be submitted
        output_dir (str): GCS directory to which output will be written

    Returns:
        beam.pipeline.PipelineOptions
    """
logging.warning('Start running in the cloud')
options = {
'runner': 'DataflowRunner',
'job_name': ('mlengine-boilerplate-{}'.format(
datetime.now().strftime('%Y%m%d%H%M%S'))),
'staging_location': os.path.join(BUCKET, 'staging'),
'temp_location': os.path.join(BUCKET, 'tmp'),
'project': project,
'region': 'europe-west1',
'zone': 'europe-west1-d',
'autoscaling_algorithm': 'THROUGHPUT_BASED',
'save_main_session': True,
'setup_file': './setup.py',
}
return beam.pipeline.PipelineOptions(flags=[], **options)
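# Note: staging_location and temp_location are placed under BUCKET from
# trainer.config; the output_dir argument is currently not used when building
# these options.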


def main(argv=None):
    """Run preprocessing as an Apache Beam pipeline.

    Args:
        argv (list): list of command-line arguments
    """
args = parse_arguments(sys.argv if argv is None else argv)
if args.cloud:
pipeline_options = get_cloud_pipeline_options(args.project_id,
args.output_dir)
else:
pipeline_options = None
pipeline = beam.Pipeline(options=pipeline_options)
examples = (pipeline
# | 'ReadData' >> beam.Create(open('data/test.csv')
# .readlines()[1:])
| 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '*',
skip_header_lines=1)
| 'BuildExamples' >> beam.FlatMap(build_example))
examples_split = examples | beam.Partition(partition_fn, 3)
example_dict = {
'train': examples_split[0],
'validation': examples_split[1],
'test': examples_split[2]
}
    # Write each partition to its own set of gzipped TFRecord files.
    for part, part_examples in example_dict.items():
        part_examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord(
file_path_prefix=os.path.join(TFRECORD_DIR, part + '_examples'),
compression_type=filesystem.CompressionTypes.GZIP,
coder=coders.ExampleProtoCoder(schema),
file_name_suffix='.tfrecord.gz')
pipeline.run().wait_until_finish()


if __name__ == '__main__':
    main()
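# Example usage (project id is a placeholder):
#   python preprocess.py                                   # local run (DirectRunner)
#   python preprocess.py --cloud --project_id=my-gcp-project   # run on Dataflow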