-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_plant_noncontaminated_training_pipeline.py
70 lines (58 loc) · 2.46 KB
/
create_plant_noncontaminated_training_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pandas as pd
from itertools import groupby
import string
from sequence_mixing import *
"""Read in the training/validation data"""
files = ['data/RefSeqPlantsTest/GCF_000004075.2_ASM407v2_genomic.fna',
'data/RefSeqPlantsTest/GCF_000258705.1_Coccomyxa_subellipsoidae_v2.0_genomic.fna',
'data/RefSeqPlantsTest/GCF_000493195.1_Citrus_clementina_v1.0_genomic.fna']
species = ['Mystery Plant 1', 'Mystery Plant 2', 'Mystery Plant 3']
training_point_count = 10000
data_points = {}
first_column = ['Non-contaminated']*training_point_count
"""Replace all non-A, C, G, T characters in a given sequence
with A, C, G, T, each possibility being equally likely"""
def clean_sequence(seq):
good_chars = ['A', 'C', 'G', 'T']
new_chars = []
for i in range(len(seq)):
if seq[i] not in good_chars:
r = random.random()
index = (int)(len(good_chars)*r)
if index == 4: index = 3
new_chars.append(good_chars[index])
else:
new_chars.append(seq[i])
return ''.join(new_chars)
"""Loop through each of the species"""
for i in range(len(files)):
print i
filename = files[i]
speciesname = species[i]
"""Read in the sequence for the current species, and clean by
making all letters upper case and removing any letters that are
not one of A, C, G, T"""
with open(filename) as f:
groups = groupby(f, key=lambda x: not x.startswith(">"))
d = {}
for k,v in groups:
if not k:
key, val = list(v)[0].rstrip(), "".join(map(str.rstrip,next(groups)[1],""))
d[key] = val
f.close()
seq_fragments = [d[key] for key in d.keys() if ('chloroplast' not in key and 'mitochondria' not in key)]
seq = ''.join(seq_fragments)
seq = seq.upper()
seq = seq.strip()
good_chars = ['A', 'C', 'G', 'T']
seq = seq.replace('N', '')
"""Choose a random subsequence of the appropriate length (don't contaminate)"""
non_contaminated_seqs = []
for j in range(training_point_count):
non_contaminated_seq = random_mixed_sequence(seq, '', mixed_len = 500)
non_contaminated_seq = clean_sequence(non_contaminated_seq)
non_contaminated_seqs.append(non_contaminated_seq)
data_points[speciesname] = non_contaminated_seqs
"""Write everything out to a file"""
non_contaminated_seqs_df = pd.DataFrame(data_points, index = first_column)
non_contaminated_seqs_df.to_csv('non_contaminated_mystery_sequences.csv')