-
Notifications
You must be signed in to change notification settings - Fork 0
/
rna_to_protein.py
287 lines (242 loc) · 14 KB
/
rna_to_protein.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
import argparse
import os
import sys
import pandas as pd
import data_processor
import modeler
import utils
import config as configuration
import json
#
# prepare --config /Users/c/Documents/umb/workspace/cs696/config.json
# extract --config /Users/c/Documents/umb/workspace/cs696/config.json --output_dir output --ids "NP_892006.3" "NP_958782.1"
#
# This is the starting point of the data preparation and processing
# for the RNA prediction application.
#
def main():
"""
:return:
"""
parser = argparse.ArgumentParser(prog='RNA Predictor',
description='Prepare, Extract and Process RNA data',
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('--config',
required=True,
help='path to program config file')
parser.add_argument('action',
choices=['prepare', 'extract', 'model', 'prep-single-cell',
'predict-single-cell', 'calc-normalization-factor'],
help='action to perform\n'
' prepare: process and prepare the raw proteomic and RNA data for processing\n'
' extract: extract samples with non-zero rna values for a set of protein ids\n'
' model: make and test prediction models based on prepared protein and rna counts\n'
' calc-normalization-factor: calculate and save the single cell normalization factor\n'
' prep-single-cell: prepare the single cell rna counts to for predicting protein ids\n'
' predict-single-cell: predict the value of a specific protein for a specific cell\n')
parser.add_argument('--prot_ids',
nargs='+',
help='list of protein ids to extract or predict')
parser.add_argument('--cell_ids',
nargs='+',
help='list of cell ids to predict protein vallues')
parser.add_argument('--output_dir',
nargs='?', help='directory where the results should be saved')
args = parser.parse_args()
# load the config json file, which contains references to the data
if args.config is not None:
with open(args.config) as config_file:
config_map = json.load(config_file)
config = configuration.Config(config_map)
action = args.action
#
# calculate the normalization factor and write the value in a file to be used later
# for prediction. This value ise used to normalize the rna values for single cells
# to be comparable to RNA values from the proteomic data.
#
if action == 'calc-normalization-factor':
with (open(config.cell_rna_total_count_file(), 'r') as cell_rna_total_count_map_file):
cell_rna_total_count_map = json.load(cell_rna_total_count_map_file)
normalization_factor = data_processor.find_normalization_factor(
config.get_prepared_rna_count_file(), cell_rna_total_count_map)
print(f'Average total normalization factor: {normalization_factor}')
stamped_out_file = f'{config.get_normalization_fact_file()}_{utils.get_date_time_str()}'
with open(config.get_normalization_fact_file(), 'w') as stamped_output, \
open(stamped_out_file, 'w') as latest_output:
stamped_output.write(str(normalization_factor))
latest_output.write(str(normalization_factor))
#
# This is a preprocessing step where the relevant data are extracted from
# the original files, aggregated into a single and patient ids are
# reconciled from the protein and rna files.
#
# This step is required before any other action.
#
elif action == 'prepare':
print(f'Preparing all data for processing ...')
print(f'protemoic: {config.get_proteomic_data_dir()}')
if not os.path.exists(config.get_proteomic_data_dir()):
print(f'Unzipping proteomic data file {config.get_proteomic_zip_file()} ...')
utils.unzip(config.get_proteomic_zip_file(), config.get_proteomic_data_dir())
else:
print(f'Found proteomic data file {config.get_proteomic_zip_file()} ...')
prot_df, rna_df, total_cnt, na_cnt = \
data_processor.prepare_data(raw_prot_data_dir=config.root_dir,
prepared_data_dir=config.get_prepared_data_dir(),
rna_counts_file=config.get_rna_counts_file(),
prepared_rna_counts_file=config.get_prepared_rna_count_file(),
prepared_prot_counts_file=config.get_prepared_prot_count_file())
print(f"Across all files: total Count: {total_cnt}, NA Count: {na_cnt if na_cnt >= 0 else '<not counted>'}")
#
# Extract the RNA data for the specified list of proteins and save the results.
# Only RNA samples that have non-zero values for all RNA counts are saved, that
# is, samples that have a zero count for one or more RNA are dropped from the results.
#
# A list of protein ids must be provided for this action. Protein ides are stored
# in a file called prot.csv and sample rna counts for the specified proteins are
# stored in a file called rna.csv. both files are stored under the folder specified
# with the --output_dir option under the program's "generated" folder. If the
# value of --output_dir is an absolute path, the "generated" folder is ignored and
# the results are stored under the specified folder.
# Note that you must run the "prepare" command before running this command.
#
elif action == 'extract':
# if args.prot_ids is None or args.output_dir is None:
if args.prot_ids is None or args.output_dir is None:
print(f'Usage: You must provide a list of protein ids and output directory with the extract command')
parser.print_usage()
exit(1)
# Columns names have been normalized during the preparation step,
# so we don't have to worry about it here
# load the prepared data into dataframes
df_prot_counts = pd.read_csv(config.get_prepared_prot_count_file(), sep=configuration.comma, header='infer')
df_rna_counts = pd.read_csv(config.get_prepared_rna_count_file(), sep=configuration.comma)
# extract the columns from the rna files that have no zeros in their values
df_rna_counts_no_zero = df_rna_counts[df_rna_counts.iloc[:, 1] > 100]
orig_sample_counts = df_rna_counts.shape[0]
non_zero_sample_count = df_rna_counts_no_zero.shape[0]
print(f'Total samples: {orig_sample_counts}')
print(f'Eliminated samples: {orig_sample_counts - non_zero_sample_count}')
print(f'Extracted samples: {non_zero_sample_count}')
# find the common columns between the rna-non-zero-df and prot_df
common_col = list(set(df_rna_counts_no_zero.columns).intersection(df_prot_counts.columns))
# add back the first column name into the columns list
filtered_prot_cols = list(common_col)
filtered_prot_cols.insert(0, df_prot_counts.columns[0])
filtered_rna_cols = list(common_col)
filtered_rna_cols.insert(0, df_rna_counts.columns[0])
filtered_prot_df = data_processor.extract_columns(df_prot_counts, filtered_prot_cols)
filtered_rna_df = data_processor.extract_columns(df_rna_counts, filtered_rna_cols)
# Persist the extracted data
extract_output_dir = os.path.join(config.get_extract_data_dir(), args.output_dir)
utils.ensure_dir_exists(extract_output_dir)
prot_output_file = os.path.join(extract_output_dir, 'prot.csv')
rna_output_file = os.path.join(extract_output_dir, 'rna.csv')
print(f'writing results:\n\textracted protein counts: {prot_output_file}'
f'\n\tExtracted RNA counts: {rna_output_file}')
filtered_prot_df.to_csv(prot_output_file, header=True, index=False)
filtered_rna_df.to_csv(rna_output_file, header=True, index=False)
#
# Create a model and test for protein prediction for a set of protein ids (specified
# on the command line, or all protein ids in the protein count file. Results are
# stored in a file. Models are not saved.
# Note that running this action for all proteins will take several hours.
#
elif action == 'model':
prot_ids = args.prot_ids if args.prot_ids is not None and len(args.prot_ids) > 0 else None
results_file = modeler.test_models(config.get_prepared_prot_count_file(),
config.get_prepared_rna_count_file(),
config.get_model_test_results_dir(),
protein_ids=prot_ids)
print(f'Saved results in: {results_file}')
#
# Prepare the single cell data which can be used for predicting protein values
# based on RNA values measured for single cells. The result of this action is
# a json file containing a map whose keys are single cell ids and whose values
# are maps containing RNA IDs as keys and RNA counts as values.
# This file can then be used to create a model for a particular cell, based on
# proteomic data, and used for predicting the count for a specific protein.
# The model will be for a specific cell and a specific protein.
#
elif action == 'prep-single-cell':
cell_rna_counts_map, non_matching_rna_names, cell_rna_total_count_map = \
data_processor.extract_single_cell_non_zero(
var_map_file=config.get_var_map(),
cell_rna_id_counts_file=config.get_single_cell_rna_file(),
normalization_method=config.get_normalization_method())
# rna_counts_file=config.get_prepared_rna_count_file())
# utils.log_normalize_single_cells(cell_rna_total_count_map)
with open(config.get_single_cell_rna_counts_file(), 'w', encoding='UTF-8') as rna_counts_json, \
open(config.get_single_cell_no_match_rna_file(), 'w', encoding='UTF-8') as non_matching_rna_json, \
open(config.cell_rna_total_count_file(), 'w', encoding='UTF-8') as single_cell_total_count_json:
rna_counts_json.write(json.dumps(cell_rna_counts_map, indent=2))
non_matching_rna_json.write(json.dumps(list(non_matching_rna_names), indent=2))
single_cell_total_count_json.write(json.dumps(cell_rna_total_count_map, indent=2))
#
# This command uses the RNA values for a single cell to predict the protein values for
# the same cell. This command may be invoked for one or more proteins. For each protein
# the program creates a model based on common RNA values with the RNA values from the
# proteomic data set and uses that model for prediction.
#
elif action == 'predict-single-cell':
if args.prot_ids is None:
print(f'You must provide at least one protein id by using --prot_ids argument with this command')
exit(2)
try:
normalization_fact_str = utils.read_line_from_file(config.get_normalization_fact_file(), 1)
except FileNotFoundError:
print(f'Normalization file "{config.get_normalization_fact_file()}" not found')
sys.exit(1)
if normalization_fact_str is None or len(normalization_fact_str) == 0:
print(f'Normalization factor extracted from "{config.get_normalization_fact_file()}" is invalid')
sys.exit(1)
try:
normalization_fact = float(normalization_fact_str)
except ValueError:
print(f'Normalization factor "{normalization_fact_str}" extracted from '
f'"{config.get_normalization_fact_file()}" is invalid')
sys.exit(1)
prot_ids = args.prot_ids
cell_ids = args.cell_ids if args.cell_ids is not None and len(args.cell_ids) > 0 else None
# this is a map of maps to hold the results of multiple protein prediction and multiple cell ids
# {
# protein_id_1 -> {
# cell_1: val_1
# ...
# cell_n: val_n
# },
# ...
# protein_id_m -> {
# cell_1: val_1
# ...
# cell_n: val_n
# }
#
predicted_cell_protein_count_map = {}
# predict one protein at a time and save the results in the prediction map
for prot_id in prot_ids:
# the return value of this method is a map of cell id to model results,
# which contains the predicted value for each cell-protein pair,
single_cell_rna_count_map = data_processor.load_single_cell_counts(config.get_single_cell_rna_counts_file())
cell_id_model_results_map = modeler.predict_prot_for_single_cell_with_df(
prot_df=pd.read_csv(config.get_prepared_prot_count_file()),
rna_df=pd.read_csv(config.get_prepared_rna_count_file()),
prot_id=prot_id,
single_cell_rna_count_map=single_cell_rna_count_map,
normalization_fact=normalization_fact,
cell_ids=cell_ids)
predicted_cell_protein_count_map[prot_id] = cell_id_model_results_map
# if an output directory is mentioned, save the results
if args.output_dir is not None:
output_dir = config.get_single_cell_prediction_output_dir(args.output_dir)
utils.ensure_dir_exists(output_dir)
output_file = os.path.join(output_dir, 'single_cell_prediction_results.json')
with open(output_file, 'w', encoding='UTF-8') as prediction_json:
prediction_json.write(json.dumps(predicted_cell_protein_count_map, indent=2))
print(f'Prediction results saved in {output_file}')
else:
print(predicted_cell_protein_count_map)
if __name__ == "__main__":
"""
"""
main()