-
Notifications
You must be signed in to change notification settings - Fork 3
/
generate_data.py
173 lines (150 loc) · 7.32 KB
/
generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import argparse
import os
import re
import subprocess
from Bio import SeqIO, AlignIO
import graph_dataset
import train_valid_chrs
from config import get_config
def change_description_seqreq(file_path):
new_fasta = []
for record in SeqIO.parse(file_path, file_path[-5:]): # 'fasta' for FASTA file, 'fastq' for FASTQ file
des = record.description.split(",")
id = des[0][5:]
if des[1] == "forward":
strand = '+'
else:
strand = '-'
position = des[2][9:].split("-")
start = position[0]
end = position[1]
record.id = id
record.description = f'strand={strand} start={start} end={end}'
new_fasta.append(record)
SeqIO.write(new_fasta, file_path, "fasta")
def change_description_pbsim(fastq_path, maf_path, chr):
chr = int(chr[3:])
reads = {r.id: r for r in SeqIO.parse(fastq_path, 'fastq')}
# print(len(reads))
# counter = 0
for align in AlignIO.parse(maf_path, 'maf'):
ref, read_m = align
start = ref.annotations['start']
end = start + ref.annotations['size']
strand = '+' if read_m.annotations['strand'] == 1 else '-'
description = f'strand={strand} start={start} end={end} chr={chr}'
reads[read_m.id].id += f'_chr{chr}'
reads[read_m.id].name += f'_chr{chr}'
reads[read_m.id].description = description
# counter += 1
# print(counter)
fasta_path = fastq_path[:-1] + 'a'
SeqIO.write(list(reads.values()), fasta_path, 'fasta')
os.remove(fastq_path)
return fasta_path
def merge_dicts(d1, d2, d3={}):
keys = {*d1, *d2, *d3}
merged = {key: d1.get(key, 0) + d2.get(key, 0) + d3.get(key, 0) for key in keys}
return merged
def handle_pbsim_output(idx, chrN, chr_raw_path, combo=False):
if combo == True:
idx = chrN
subprocess.run(f'mv {idx}_0001.fastq {idx}.fastq', shell=True, cwd=chr_raw_path)
subprocess.run(f'mv {idx}_0001.maf {idx}.maf', shell=True, cwd=chr_raw_path)
subprocess.run(f'rm {idx}_0001.ref', shell=True, cwd=chr_raw_path)
fastq_path = os.path.join(chr_raw_path, f'{idx}.fastq')
maf_path = os.path.join(chr_raw_path, f'{idx}.maf')
print(f'Adding positions for training...')
fasta_path = change_description_pbsim(fastq_path, maf_path, chr=chrN) # Extract positional info from the MAF file
print(f'Removing the MAF file...')
subprocess.run(f'rm {idx}.maf', shell=True, cwd=chr_raw_path)
if combo:
return fasta_path
else:
return None
# 1. Simulate the sequences - HiFi
def simulate_reads_hifi(datadir_path, chrs_path, chr_dict, assembler, pbsim3_dir, sample_profile_id, sample_file_path, depth):
print(f'SETUP - simulate')
datadir_path = os.path.abspath(datadir_path)
chrs_path = os.path.abspath(chrs_path)
for chrN_flag, n_need in chr_dict.items():
if chrN_flag.endswith('_r'):
continue
if '+' in chrN_flag:
continue
elif chrN_flag.endswith('_hg002'):
chrN = chrN_flag[:-6]
chr_seq_path = os.path.join(chrs_path, f'{chrN}.fasta')
# depth = 60
else:
print('Give valid suffix!')
raise Exception
chr_raw_path = os.path.join(datadir_path, f'{chrN}/raw')
chr_processed_path = os.path.join(datadir_path, f'{chrN}/{assembler}/processed')
if not os.path.isdir(chr_raw_path):
os.makedirs(chr_raw_path)
if not os.path.isdir(chr_processed_path):
os.makedirs(chr_processed_path)
# TODO: Fix so that you can delete raw files
raw_files = {int(re.findall(r'(\d+).fast*', raw)[0]) for raw in os.listdir(chr_raw_path)}
prc_files = {int(re.findall(r'(\d+).dgl', prc)[0]) for prc in os.listdir(chr_processed_path)}
all_files = raw_files | prc_files
n_have = max(all_files) + 1 if all_files else 0
if n_need <= n_have:
continue
n_diff = n_need - n_have
print(f'SETUP - simulate: Simulate {n_diff} datasets for {chrN_flag} with PBSIM3')
for i in range(n_diff):
idx = n_have + i
chr_save_path = os.path.join(chr_raw_path, f'{idx}.fasta')
print(f'\nStep {i}: Simulating reads {chr_save_path}')
# Use the CHM13/HG002 profile for all the chromosomes
if f'sample_profile_{sample_profile_id}.fastq' not in os.listdir(pbsim3_dir):
assert os.path.isfile(sample_file_path), "Sample profile ID and sample file not found! Provide either a valid sample profile ID or a sample file."
subprocess.run(f'./src/pbsim --strategy wgs --method sample --depth {depth} --genome {chr_seq_path} ' \
f'--sample {sample_file_path} '
f'--sample-profile-id {sample_profile_id} --prefix {chr_raw_path}/{idx}', shell=True, cwd=pbsim3_dir)
else:
subprocess.run(f'./src/pbsim --strategy wgs --method sample --depth {depth} --genome {chr_seq_path} ' \
f'--sample-profile-id {sample_profile_id} --prefix {chr_raw_path}/{idx}', shell=True, cwd=pbsim3_dir)
handle_pbsim_output(idx, chrN, chr_raw_path)
# 2. Generate the graphs
def generate_graphs_hifi(datadir_path, chr_dict, assembler, threads):
print(f'SETUP - generate')
datadir_path = os.path.abspath(datadir_path)
for chrN_flag, n_need in chr_dict.items():
if chrN_flag.endswith('_hg002'):
chrN = chrN_flag[:-6]
chr_sim_path = os.path.join(datadir_path, f'{chrN}')
else:
print(f'Give valid suffix')
raise Exception # TODO: Implement custom exception
chr_raw_path = os.path.join(chr_sim_path, 'raw')
chr_prc_path = os.path.join(chr_sim_path, f'{assembler}/processed')
n_raw = len(os.listdir(chr_raw_path))
n_prc = len(os.listdir(chr_prc_path))
n_diff = max(0, n_raw - n_prc)
if n_diff > 0:
print(f'SETUP - generate: Generate {n_diff} graphs for {chrN}')
graph_dataset.AssemblyGraphDataset_HiFi(chr_sim_path, assembler=assembler, threads=threads, generate=True)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--datadir', type=str, help='path to directory where the generated data will be saved')
parser.add_argument('--chrdir', type=str, help='path to directory where the chromosome references are stored')
parser.add_argument('--asm', type=str, help='assembler used for the assembly graph construction [hifiasm|raven]')
parser.add_argument('--threads', type=int, default=1, help='number of threads used for running the assembler')
args = parser.parse_args()
chrs_path = args.chrdir
assembler = args.asm
datadir_path = args.datadir
threads = args.threads
config = get_config()
pbsim3_dir = config['pbsim3_dir']
sample_profile_id = config['sample_profile_ID']
assert len(sample_profile_id) > 0, "You need to specify sample_profile_id!"
sample_file = config['sample_file']
seq_depth = config['sequencing_depth']
train_chr, valid_chr = train_valid_chrs.get_train_valid_chrs()
all_chr = merge_dicts(train_chr, valid_chr)
simulate_reads_hifi(datadir_path, chrs_path, all_chr, assembler, pbsim3_dir, sample_profile_id, sample_file, depth=seq_depth)
generate_graphs_hifi(datadir_path, all_chr, assembler, threads)