-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_preprocess.py
269 lines (227 loc) · 10 KB
/
data_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import re
import sys
from pathlib import Path

import pandas as pd
from sklearn.utils import resample
## Handler functions
# Handles whether to load the dataset from the BeatAML project or a different dataset
def load_dataset(url, project, normalization):
    """Dispatch dataset loading to the project-specific loader.

    Known projects are "beataml", "target" and "pd" (case-insensitive);
    anything else falls back to the generic RNA-seq loader.
    Returns (dataset, samples) as produced by the chosen loader.
    """
    project_key = project.lower()
    if project_key == "beataml":
        return load_dataset_beatAML(url, normalization)
    if project_key == "target":
        return load_dataset_target(url, normalization)
    if project_key == "pd":
        return load_dataset_pd(url)
    return load_dataset_rnaseq(url)
# Handles whether to load the labels from the BeatAML project or from a different dataset
def load_labels(url, project, drug_name):
    """Dispatch label loading to the project-specific loader.

    Mirrors load_dataset: "beataml", "target" and "pd" (case-insensitive)
    have dedicated loaders; anything else uses the generic RNA-seq one.
    Only the BeatAML loader uses drug_name.
    """
    project_key = project.lower()
    if project_key == "beataml":
        return load_labels_beatAML(url, drug_name)
    if project_key == "target":
        return load_labels_target(url)
    if project_key == "pd":
        return load_labels_pd(url)
    return load_labels_rnaseq(url)
# Matches the samples from the dataset and labels, gets rid of any samples that are not available in both data matrices
def sample_match(dataset, labels, dataset_samples):
    """Keep only the samples present in both the dataset and the labels.

    Filters the labels down to SIDs that appear in dataset_samples, then
    selects the matching dataset columns in label order.
    Returns (matched_dataset, matched_labels, shared_sample_ids).
    """
    matched_labels = labels[labels['SID'].isin(dataset_samples)]
    shared_ids = matched_labels['SID']
    matched_dataset = dataset[shared_ids]
    return matched_dataset, matched_labels, shared_ids
## Functions that change label notation
def category_to_binary(group):
    """Map a responder category to a binary class.

    "high" -> 1, "low" -> 0, anything else -> -1 (unclassified).
    """
    if group == "low":
        return 0
    if group == "high":
        return 1
    return -1
def group_to_bool(group):
    """Map a "Positive"/"Negative" group label to True/False; -1 otherwise."""
    if group == "Negative":
        return False
    if group == "Positive":
        return True
    return -1
def bool_to_group(bool):
    """Inverse of group_to_bool: True -> "Positive", False -> "Negative",
    anything else -> -1.

    The parameter shadows the builtin ``bool``; kept for interface
    compatibility with existing keyword callers.
    """
    if bool == False:
        return "Negative"
    if bool == True:
        return "Positive"
    return -1
def group_to_binary(group):
    """Map a PD treatment group to a binary class.

    "Group 1" or 1 -> 0, "Group 2" or 2 -> 1, anything else -> -1.
    """
    if group in ("Group 1", 1):
        return 0
    if group in ("Group 2", 2):
        return 1
    return -1
def binary_to_group(binary):
    """Inverse of group_to_binary: 0 -> "Group 1", 1 -> "Group 2",
    anything else -> "Unknown"."""
    if binary == 1:
        return "Group 2"
    if binary == 0:
        return "Group 1"
    return "Unknown"
def bool_to_binary(bool):
    """Map True -> 0, False -> 1, anything else -> -1.

    NOTE(review): this is inverted relative to the usual int(bool)
    convention (True would normally be 1) — confirm the inversion is
    intentional before changing it.
    The parameter shadows the builtin ``bool``; kept for compatibility.
    """
    if bool == False:
        return 1
    if bool == True:
        return 0
    return -1
def auc_to_binary(value, q1, q3):
    """Classify an AUC value against quantile thresholds.

    value >= q3 -> 1 (high responder), value <= q1 -> 0 (low responder),
    anything strictly between -> -1 (discarded later).
    The upper-quantile test runs first, so if q1 == q3 a boundary value
    is classified as 1 — same as the original branch order.
    """
    if value >= q3:
        return 1
    if value <= q1:
        return 0
    return -1
def vital_to_binary(value):
    """Return 1 when vital status is exactly "Alive", 0 for anything else."""
    return 1 if value == "Alive" else 0
### PROJECT DATASETS
## Loads the RNA Sequence Data Matrix from the BeatAML Project
def load_dataset_beatAML(url, normalization):
    """Load the BeatAML expression matrix in the requested normalization.

    "cpm" reads the tab-separated read-count matrix; "rpkm" reads the RPKM
    sheet of the BeatAML supplementary workbook. Any other value aborts.
    Returns (dataset, samples): genes as the row index (the features),
    sample IDs as columns.
    """
    if normalization == "cpm":
        dataset = pd.read_csv(url + "read_count_matrix.txt", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, sep="\t")
    elif normalization == "rpkm":
        dataset = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S8-Gene Counts RPKM", dtype = 'float64', converters = {'Gene': str, 'Symbol': str}, engine="openpyxl")
    else:
        sys.exit("ERROR BeatAML Project: Dataset requested not available. List of available datasets are ['cpm', 'rpkm']")
    # Gene IDs become the feature index; the Symbol column is redundant once
    # the Gene ID tracks each row, so it is dropped.
    dataset = dataset.set_index('Gene').drop('Symbol', axis = 1)
    # Column headers contain 'X' where sample IDs use '-' (presumably a
    # header-mangling artifact) — swap every 'X' back to '-'.
    dataset.columns = [header.replace('X', '-') for header in dataset.columns]
    return dataset, dataset.columns
## Loads the corresponding high responder/low responder labels for "drug_name" from the BeatAML Project
def load_labels_beatAML(url, drug_name):
    """Build binary high/low responder labels for one drug from BeatAML.

    Reads the drug-response sheet, keeps drugs tested on more than 300
    samples, then labels each sample for ``drug_name`` by where its AUC
    falls relative to that drug's own quartiles: <= Q1 -> 0 (low
    responder), >= Q3 -> 1 (high responder), in between -> discarded.

    Returns a DataFrame with columns 'SID' (sample ID) and 'GROUP' (0/1).
    Exits the process if the requested drug is not found.
    """
    labels = pd.read_excel(url + "variants_BeatAML.xlsx", sheet_name="Table S10-Drug Responses", usecols = ['inhibitor', 'lab_id', 'auc', 'counts'], engine="openpyxl")
    # Keep only drugs tested on more than 300 samples
    labels = labels[labels['counts'] > 300]
    labels = labels.drop('counts', axis = 1)
    # Keep only the first token of each drug name (drops the parenthesized
    # aliases), which makes matching and result-file naming easier
    labels['inhibitor'] = labels['inhibitor'].apply(lambda x: x.split(' ')[0])
    # NOTE(review): str.contains is a substring/regex match while the filter
    # below is exact equality — a partial match with no exact match would
    # pass this check yet yield an empty label set; confirm drug_name is
    # always an exact first-token name.
    if labels['inhibitor'].str.contains(drug_name).any():
        # Select only the rows for the requested drug
        labels = labels[labels['inhibitor'] == drug_name]
        labels = labels[['lab_id', 'auc']]
        # 1st and 3rd quartile of this drug's AUC distribution
        q1 = labels['auc'].quantile(.25)
        q3 = labels['auc'].quantile(.75)
        # auc <= q1 -> 0 (low responder), auc >= q3 -> 1 (high responder),
        # anything in between -> -1 (removed just below)
        labels['GROUP'] = labels['auc'].apply(lambda x: auc_to_binary(x, q1, q3))
        labels = labels.drop('auc', axis = 1)
        # Discard samples that fell strictly between the quartiles
        labels = labels[labels['GROUP'].isin([0, 1])]
        labels = labels.rename(columns = {'lab_id':'SID'})
    else:
        sys.exit("ERROR beatAML Project: Labels requested not available. List of available labels are ['UNC2025A', 'original']")
    return labels
def load_dataset_target(url, normalization):
    """Load the TARGET expression matrix in the requested normalization.

    "cpm" reads the pre-built genesdf.txt; "rpkm" reads the quantile
    normalized RPKM matrix and shortens its TARGET barcode column names to
    just the case-ID segment. Any other value aborts.
    Returns (dataset, samples): genes as the row index, sample IDs as
    columns.
    """
    if normalization == "cpm":
        dataset = pd.read_csv(url + "genesdf.txt", sep="\t")
        dataset = dataset.drop("Symbol", axis = 1)
        dataset = dataset.set_index('Gene')
    elif normalization == "rpkm":
        dataset = pd.read_csv(url + "TARGET_NBM_AML_QuantileNormalized_RPKM.txt", sep="\t")
        # Shorten "TARGET-##-<case>-##A-##R" barcodes to just "<case>".
        # The first two columns are gene metadata, so they are skipped.
        # NOTE(review): re.search(...)[1] raises TypeError if a header does
        # not match the barcode pattern — confirm all sample columns do.
        pattern = "TARGET-[0-9][0-9]-(...*)-[0-9][0-9]A-[0-9][0-9]R"
        renamed = {header: re.search(pattern, header)[1] for header in dataset.columns[2:]}
        dataset = dataset.rename(columns=renamed)
        dataset = dataset.drop("gene_name", axis = 1)
        dataset = dataset.set_index("gene_id")
    else:
        # Fixed: the original error message said "BeatAML" in the TARGET loader
        sys.exit("ERROR TARGET Project: Dataset requested not available. List of available datasets are ['cpm', 'rpkm']")
    samples = dataset.columns
    return dataset, samples
def load_labels_target(url):
    """Load TARGET labels and binarize vital status.

    Reads target.csv and maps the GROUP column from vital status to binary:
    "Alive" -> 1, anything else -> 0 (vectorized equivalent of
    vital_to_binary). Returns the labels DataFrame.
    """
    labels = pd.read_csv(url + "target.csv")
    # Vectorized form of the row-wise vital_to_binary apply; also drops the
    # leftover debug print the original carried.
    labels['GROUP'] = (labels['GROUP'] == "Alive").astype(int)
    return labels
def load_dataset_pd(url):
    """Load the PD SNP matrix, indexed by "chromosome-position".

    Builds a "#CHROM-POS" key from the chromosome and position columns and
    uses it as the feature index. Returns (dataset, samples).

    Bug fix: the original called dataset.drop(...) without assigning the
    result, so #CHROM and POS stayed in the matrix and leaked into the
    sample list.
    """
    dataset = pd.read_csv(url + "snp_matrix.csv", sep="\t")
    dataset["#CHROM-POS"] = dataset["#CHROM"].astype(str) + "-" + dataset["POS"].astype(str)
    dataset = dataset.drop(["#CHROM", "POS"], axis = 1)
    dataset = dataset.set_index('#CHROM-POS')
    samples = dataset.columns
    return dataset, samples
def load_labels_pd(url):
    """Load PD treatment-group labels as binary classes.

    Reads the treatment-code table (SID, GROUP), maps GROUP through
    group_to_binary ("Group 1"/1 -> 0, "Group 2"/2 -> 1, else -1) and
    normalizes sample IDs by replacing '.' with '-' (presumably to match
    the dataset's column naming — confirm against the SNP matrix headers).
    Returns the labels DataFrame. Removes the leftover debug print and the
    redundant lambda wrapper around group_to_binary.
    """
    labels = pd.read_csv(url + "00-PD-TreatmentCodeTable-ALL153.csv", usecols = ["SID", "GROUP"])
    labels['GROUP'] = labels['GROUP'].apply(group_to_binary)
    labels['SID'] = labels['SID'].apply(lambda x: x.replace('.','-'))
    return labels
# Creates new directory and subdirectories if given a path and the directory does not exist
# Used extensively to save results
def make_result_dir(path):
    """Create the directory at ``path`` (with any missing parents).

    Idempotent: does nothing if the directory already exists. Used
    throughout the pipeline before saving results.
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
#### !!!! CAN BE MODIFIED TO FIT YOUR OWN DATASET !!!! ####
## Function to load new dataset
def load_dataset_rnaseq(url):
    """Load a generic tab-separated expression matrix.

    The first column becomes the row (feature) index; remaining columns
    are samples. Returns (dataset, samples).
    """
    dataset = pd.read_csv(url, sep='\t', index_col=0)
    return dataset, dataset.columns
## Function to load new labels
def load_labels_rnaseq(url):
    """Load a generic tab-separated label table.

    The first column becomes the row index; the remaining columns are
    returned unchanged.
    """
    return pd.read_csv(url, sep='\t', index_col=0)
def simulate_data(dataset, labels, simulation_size):
    """Inflate the dataset to ``simulation_size`` samples by resampling.

    Draws the needed number of extra samples from the existing labels
    (with replacement, half per GROUP, fixed seed for reproducibility),
    duplicates the matching dataset columns under fresh
    'simulated_sample_<i>' IDs, and appends both to the originals.

    Returns (enlarged dataset, enlarged labels, new column index).
    Exits if simulation_size is smaller than the current sample count.
    """
    dataset_size = len(dataset.columns)
    # How many synthetic samples are needed on top of the real ones
    extra_samples_size = simulation_size - dataset_size
    if(extra_samples_size < 0):
        sys.exit("Requested simulation of data that's smaller than sample size, please change the simulation size")
    # Sample int(extra/2) rows from each GROUP, with replacement, seeded.
    # NOTE(review): if extra_samples_size is odd, or GROUP has a number of
    # classes other than 2, the sampled row count will not equal
    # extra_samples_size and the col_names assignments below will raise a
    # length mismatch — confirm callers always request an even difference.
    extra_samples = labels.groupby("GROUP").sample(n = int(extra_samples_size / 2), random_state=1, replace = True)
    # Duplicate the expression columns that correspond to the sampled SIDs
    sampled_dataset = dataset[extra_samples["SID"]]
    col_names = ["simulated_sample_" + str(i) for i in range(0, extra_samples_size)]
    # Give the duplicates fresh, unique sample IDs in both structures so
    # they do not collide with the real samples they were copied from
    extra_samples["SID"] = col_names
    sampled_dataset.columns = col_names
    dataset = pd.concat([dataset, sampled_dataset], axis=1)
    labels = pd.concat([labels, extra_samples], axis=0)
    return dataset, labels, dataset.columns
def balance_dataset(X_imbalanced, y_imbalanced):
    """Balance classes by down-sampling the majority class (GROUP == 1).

    Joins features and labels on sample ID, draws a without-replacement
    subsample of the GROUP==1 rows sized to match GROUP==0 (fixed seed for
    reproducibility), and splits the result back apart.
    Returns (X_balanced, y_balanced, balanced sample IDs).
    """
    merged = X_imbalanced.T
    merged["GROUP"] = y_imbalanced.set_index("SID")
    majority = merged[merged.GROUP==1]
    minority = merged[merged.GROUP==0]
    # Down-sample the majority class, without replacement, to the minority
    # class size; random_state fixed so the subsample is reproducible
    majority_downsampled = resample(majority,
                                    replace=False,
                                    n_samples=len(minority),
                                    random_state=123)
    balanced = pd.concat([majority_downsampled, minority])
    # Display new class counts
    print(balanced.GROUP.value_counts())
    # Rebuild the label frame: the row index (sample IDs) becomes the SID column
    y_balanced = balanced["GROUP"].reset_index().rename(columns={"index": "SID"})
    # Strip the label column and transpose back to samples-as-columns
    X_balanced = balanced.drop('GROUP', axis=1).T
    print(y_balanced)
    return X_balanced, y_balanced, X_balanced.columns