-
Notifications
You must be signed in to change notification settings - Fork 1
/
subcluster_checking_assembling_and_analysing.py
237 lines (209 loc) · 10.8 KB
/
subcluster_checking_assembling_and_analysing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import os
import time
import datetime
import sys
import pprint
def get_cluster_dict(path_to_drep_cluster_table):
with open(path_to_drep_cluster_table,'r') as f:
cluster_table=f.readlines()
for i in range(len(cluster_table)):
cluster_table[i]=cluster_table[i].rstrip('\n')
cluster_dict={}
for i in cluster_table[1:]:
j=i.split(',')
if j[1] not in cluster_dict:
cluster_dict[j[1]]=[j[0]]
else:
cluster_dict[j[1]].append(j[0])
print(f'Removing {len([j for j in cluster_dict if len(cluster_dict[j])<2])} <one member> clusters')
[cluster_dict.pop(i) for i in [j for j in cluster_dict if len(cluster_dict[j])<2]]
print(f'Total left {len(cluster_dict)} clusters')
return cluster_dict
def check_subcluster_animf99(cluster_dict,path_to_checkm_result,path_to_mag):
if cluster_dict:
failed_count=0
for cluster in cluster_dict:
os.system("mkdir drep")
with open('./drep/sample_hmq.tsv','w') as f:
for sample in cluster_dict[cluster]:
f.writelines(f'{path_to_mag}/{sample}\n')
os.system('echo "genome,completeness,contamination,strain_heterogeneity" > ./drep/sample_stat.csv;sed "s/\t/,/g" %s > checkm;\
for i in $(cat ./drep/sample_hmq.tsv | awk -F "/" \'{print $NF}\' | sed "s/.fa//g");do grep "$i," checkm \
| awk -F "," \'{print $1".fa",$12,$13,$14}\' | sed "s/ /,/g";done >> ./drep/sample_stat.csv;rm checkm' % (path_to_checkm_result))
with open('./drep/drep.sh','w') as f:
f.write("#!/bin/bash\n\
dRep dereplicate \\\n\
./ \\\n\
--processors 20 \\\n\
--length 50000 \\\n\
--completeness 75 \\\n\
--contamination 25 \\\n\
--genomes sample_hmq.tsv \\\n\
--genomeInfo sample_stat.csv \\\n\
--S_algorithm fastANI \\\n\
--P_ani 0.9 \\\n\
--S_ani 0.99 \\\n\
--cov_thresh 0.1 \\\n\
> drep.log 2>&1")
os.system("chmod +x ./drep/drep.sh")
text=f'Processing dRep of cluster {cluster} ...\n'
for words in text:
sys.stdout.write(words)
sys.stdout.flush()
time.sleep(0.01)
start=datetime.datetime.now()
while os.system("cd drep/;bash drep.sh")==0:
print(f'Cluster {cluster} finished dRep process!')
break
f=os.popen("ls ./drep/dereplicated_genomes/ | wc -l")
if f.read()[0]=='1':
print(f'Cluster {cluster} passed examination!')
failed=False
else:
print(f'There are multiple subsets in cluster {cluster}')
failed=True
if failed:
with open('./failed_cluster.log','a') as f:
f.write(f'\n<<<<<<<<<<Cluster {cluster}>>>>>>>>>>\n')
os.system("cat ./drep/data_tables/Cdb.csv >> ./failed_cluster.log")
failed_count=failed_count+1
with open('./all_drep.log','a') as f:
f.write(f'<<<<<<<<<<Cluster {cluster}>>>>>>>>>>\n')
os.system("cat ./drep/sample_stat.csv ./drep/drep.log >> ./all_drep.log;\
ls ./drep/dereplicated_genomes/ >>./all_drep.log;\
rm -rf ./drep/")
end=datetime.datetime.now()
running_time=end-start
with open('./all_drep.log','a') as f:
f.write(f'Running time: {running_time}\n\n')
print("Running time: ",running_time)
print("----------\n")
if failed_count:
print(f'All checking process finished, {failed_count} clusters failed. Checking all_drep.log and failed_cluster.log for more details.')
else:
print("All checking process finished, all clusters passed. Checking all_drep.log for more details.")
else:
print("Checking process failed! There are no clusters with at least 2 members in the cluster_dict!")
def make_RG_statistical_file(path_to_drep_RG_dir,cluster_dict,path_to_checkm_result,path_to_mag):
drep_RG=[]
drep_RG=os.listdir(path_to_drep_RG_dir)
if '.ipynb_checkpoints' in drep_RG:
drep_RG.remove('.ipynb_checkpoints')
# print(len(drep_RG))
RG_to_cluster=[]
for cluster in cluster_dict:
for sample in cluster_dict[cluster]:
if sample in drep_RG:
RG_to_cluster.append([sample,cluster])
drep_RG.remove(sample)
break
with open('RG_to_cluster.tsv','w') as f:
f.write('RG,Cluster,\n')
for i in RG_to_cluster:
f.write(','.join(i)+',\n')
text=f'Writting RG\'s statistical file to RG_stat.csv ...\n'
for words in text:
sys.stdout.write(words)
sys.stdout.flush()
time.sleep(0.01)
if os.system('echo "completeness,contamination,strain_heterogeneity" > RG_stat.tsv;sed "s/\t/,/g" %s > checkm;\
for i in $(cat RG_to_cluster.tsv | cut -d "," -f1 | sed -n \'2,$p\' | sed "s/.fa//g");\
do grep "$i," checkm | cut -d "," -f12-;done >> RG_stat.tsv;\
paste -d "" RG_to_cluster.tsv RG_stat.tsv > RG_stat.csv;rm RG_stat.tsv RG_to_cluster.tsv checkm' % path_to_checkm_result):
print("Writting failed! Please checking your script and running again!")
else:
print("Writting finished! Checking RG_stat.csv for more details!")
def subcluster_flye_assembling_and_checkm_checking(path_to_drep_RG_dir,cluster_dict,path_to_checkm_result,path_to_mag):
if cluster_dict:
os.system("mkdir -p ./flye/assemblies/")
os.system("mkdir -p ./flye/logs/")
make_RG_statistical_file(path_to_drep_RG_dir,cluster_dict,path_to_checkm_result,path_to_mag)
os.system("mv RG_stat.csv ./flye/")
with open('./flye/assembly_stat.csv','a') as f:
f.write("Genome,Total_length,Fragments,N50,Largest_frg,Scaffolds,Mean_coverage\n")
succeed=True
for cluster in cluster_dict:
os.system("mkdir ./flye_temp/")
with open('./flye_temp/flye.sh','a') as f:
f.write("#!/bin/bash\nflye --subassemblies \\\n")
for sample in cluster_dict[cluster]:
sample_id=sample.split('.')[0]
f.writelines(f'{path_to_mag}/{sample} \\\n')
f.write("--threads 20 \\\n-o ./")
start=datetime.datetime.now()
text=f'Assembling cluster {cluster}\'s MAGs using flye ...\n'
for words in text:
sys.stdout.write(words)
sys.stdout.flush()
time.sleep(0.05)
if os.system("cd ./flye_temp/;chmod +x flye.sh;bash flye.sh>out.log 2>&1"):
print(f'Cluster {cluster}\'s flye subassemblies processing failed! Checking ./flye_temp/out.log and ./flye_temp/flye.log for more details.')
succeed=False
break
else:
print(f'Cluster {cluster}\'s flye subassemblies processing successed!')
os.system(f'cd ./flye_temp/;\
tail -8 flye.log | head -6 | cut -d ":" -f2 | xargs echo "{cluster}.fasta" >>../flye/assembly_stat.csv;\
mv assembly.fasta ../flye/assemblies/{cluster}.fasta;\
cat flye.log >../flye/logs/{cluster}.log')
os.system("rm -rf ./flye_temp/")
end=datetime.datetime.now()
running_time=end-start
print("Running time: ",running_time)
print("----------\n")
os.system("sed -i 's/ /,/g' ./flye/assembly_stat.csv")
print('All flye processes finished! Checking ./flye/ or ./flye_temp/(if failed) for assemblies, statistical table and logs.')
if succeed:
start=datetime.datetime.now()
text=f'Starting checkm process ...\n'
for words in text:
sys.stdout.write(words)
sys.stdout.flush()
time.sleep(0.2)
if os.system("cd ./flye/assemblies/;checkm lineage_wf --tab_table --file ../checkm_assemblies.tsv -x fasta -t 20 ./ ../checkm_assemblies_output"):
print('CheckM processing failed! Checking ./flye/checkm_assemblies_output/checkm.log for more details')
else:
print('CheckM processing finished! Checking ./flye/checkm_assemblies.tsv for results.')
end=datetime.datetime.now()
running_time=end-start
print("Running time: ",running_time)
print("----------\n")
else:
print('Subassemblies processing failed, automatically shutting down the checkM process!')
print('All flye and checkm processes finished! Checking ./flye for more details.')
else:
print("Subcluster assembling process failed! There are no clusters with at least 2 members in the cluster_dict!")
def main():
path_to_drep_cluster_table='/store/yhshao/META/perfect_cluster_result9_ani95_drep/data_tables/Cdb.csv'
path_to_drep_RG_dir='/store/yhshao/META/perfect_cluster_result9_ani95_drep/dereplicated_genomes/'
path_to_checkm_result='/store/yhshao/META/metapi_result_cami/link_to_mag_vamb_no_graphbin/checkm_output/checkm.tsv'
path_to_mag='/store/yhshao/META/metapi_result_cami/link_to_mag_vamb_no_graphbin'
cluster_dict=get_cluster_dict(path_to_drep_cluster_table)
pprint.pprint(cluster_dict)
check_subcluster_animf99(cluster_dict,path_to_checkm_result,path_to_mag)
subcluster_flye_assembling_and_checkm_checking(path_to_drep_RG_dir,cluster_dict,path_to_checkm_result,path_to_mag)
if __name__=='__main__':
main()
#Example output of this workflow:
# Removing 0 <one member> clusters
# Total left 1 clusters
# {'1_1': ['CAMI_UT_2.metaspades.vamb.bin.2.fa',
# 'CAMI_UT_22.metaspades.concoct.bin.21.fa']}
# Processing dRep of cluster 1_1 ...
# Cluster 1_1 finished dRep process!
# Cluster 1_1 passed examination!
# Running time: 0:00:32.611636
# ----------
# All checking process finished, all clusters passed. Checking all_drep.log for more details.
# Writting RG's statistical file to RG_stat.csv ...
# Writting finished! Checking RG_stat.csv for more details!
# Assembling cluster 1_1's MAGs using flye ...
# Cluster 1_1's flye subassemblies processing successed!
# Running time: 0:00:19.440714
# ----------
# All flye processes finished! Checking ./flye/ or ./flye_temp/(if failed) for assemblies, statistical table and logs.
# Starting checkm process ...
# CheckM processing finished! Checking ./flye/checkm_assemblies.tsv for results.
# Running time: 0:06:32.077641
# ----------
# All flye and checkm processes finished! Checking ./flye for more details.