-
Notifications
You must be signed in to change notification settings - Fork 0
/
metacleaner.py
363 lines (337 loc) · 16.2 KB
/
metacleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#!/usr/bin/env python
import sys
import getopt
import datetime
import os.path
import shutil
import subprocess
import os
import time
import glob
import pandas as pd
import numpy as np
from Bio import SeqIO
def myfunc(argv):
global arg_query
arg_query = ""
global arg_outdir
arg_outdir = os.getcwd()
global arg_blastdbdir
arg_blastdbdir = ""
global arg_badblastdb
arg_badblastdb = ""
global arg_goodblastdb
arg_goodblastdb = ""
global arg_badblastdbinput
arg_badblastdbinput = "NONE"
global arg_goodblastdbinput
arg_goodblastdbinput = "NONE"
global arg_chunks
arg_chunks = "100"
global arg_pident
arg_pident = "100"
global arg_qcovs
arg_qcovs = "100"
global arg_threads
arg_threads = "1"
global arg_sortonly
arg_sortonly = "F"
global arg_filterlevel
arg_filterlevel = "genus"
global arg_taxadb
arg_taxadb = ""
global arg_addfilter
arg_addfilter = "NONE"
arg_help = "\nmetacleaner options:\
\n-q, --query <(required) path to query fasta file>\
\n-o, --outdir <path to output directory>\
\n-e, --pident <(default=100) percent identity threshold for filtering>\
\n-v, --qcovs <(default=100) query cover threshold for filtering>\
\n-s, --sortonly <(default=F), T/F: start at sorting step (requires previously generated blastn output files in output directory specified by -o)>\
\n-l, --filterlevel <(default=genus) one of superkingdom, phylum, class, order, family, genus, or species>\
\n-b, --blastdbdir <(required) path to badblastdb directory>\
\n-x, --badblastdb <(required) badblastdb name>\
\n-f, --badblastdbinput <path to fasta file for badblastdb (required if badblastdb does not already exist in directory specified by -b)>\
\n-y, --goodblastdb <(required) goodblastdb name>\
\n-g, --goodblastdbinput <path to fasta file for goodblastdb (required if goodblastdb does not already exist in directory specified by -b)>\
\n-d, --taxadb <(required unless constructing from scratch) path to directory containing accessionTaxa.sql file for taxonomizr>\
\n-c, --chunks <(default=100) number of chunks to split query file into (higher values may increase speed for larger query files)>\
\n-t, --threads <(default=1) number of threads for blastn>\
\n-w, --addfilter <Additional filter level, one of taxa levels as in -l (--filterlevel) and filter value, separated by a comma;\
\n hits at pident and qcovs with taxa info other than in addfilter will be removed>".format(argv[0])
try:
opts, args = getopt.getopt(argv[1:], "hq:q:b:x:y:f:g:o:c:e:v:t:s:l:d:w:", ["help", "query=", "blastdbdir=", "badblastdb=", \
"goodblastdb=", "badblastdbinput=", "goodblastdbinput=", "outdir=", "chunks=", "pident=", \
"qcovs=", "threads=", "sortonly=", "filterlevel==","taxadb==","addfilter=="])
except:
print(arg_help)
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
print(arg_help) # print the help message
sys.exit(2)
elif opt in ("-q", "--query"):
arg_query = arg
elif opt in ("-b", "--blastdbdir"):
arg_blastdbdir = arg
elif opt in ("-x", "--badblastdb"):
arg_badblastdb = arg
elif opt in ("-y", "--goodblastdb"):
arg_goodblastdb = arg
elif opt in ("-f", "--badblastdbinput"):
arg_badblastdbinput = arg
elif opt in ("-g", "--goodblastdbinput"):
arg_goodblastdbinput = arg
elif opt in ("-o", "--outdir"):
arg_outdir = arg
elif opt in ("-c", "--chunks"):
arg_chunks = str(arg)
elif opt in ("-e", "--pident"):
arg_pident = str(arg)
elif opt in ("-v", "--qcovs"):
arg_qcovs = str(arg)
elif opt in ("-t", "--threads"):
arg_threads = str(arg)
elif opt in ("-s", "--sortonly"):
arg_sortonly = arg
elif opt in ("-l", "--filterlevel"):
arg_filterlevel = arg
elif opt in ("-d", "--taxadb"):
arg_taxadb = arg
elif opt in ("-w", "--addfilter"):
arg_addfilter = arg
print('----------------------------------')
if not os.stat(arg_query).st_size == 0:
print(datetime.datetime.now())
print('> Runnning metacleaner with options:')
print('query =', arg_query)
print('blastdbdir =', arg_blastdbdir)
print('badblastdb =', arg_badblastdb)
print('badblastdbinput =', arg_badblastdbinput)
print('goodblastdb =', arg_goodblastdb)
print('goodblastdbinput =', arg_goodblastdbinput)
print('outdir =', arg_outdir)
print('chunks =', arg_chunks)
print('pident =', arg_pident)
print('qcovs =', arg_qcovs)
print('threads =', arg_threads)
print('sortonly =', arg_sortonly)
print('filterlevel =', arg_filterlevel)
print('taxadb =', arg_taxadb)
print('addfilter =', arg_addfilter)
print('----------------------------------')
else:
print("Error: query file specified by -q does not exist.")
sys.exit(2)
def check_badblastdb():
# badblastdb
fname = arg_blastdbdir + "/" + arg_badblastdb + ".nhd"
if os.path.isfile(fname):
print(arg_badblastdb + " exists in directory specified by -b")
else:
if shutil.which("makeblastdb") is not None:
badblastdb = arg_blastdbdir + "/" + arg_badblastdb
if os.path.isfile(arg_badblastdbinput):
print("badblastdb "+ arg_badblastdb + " does not exist in directory specified by -b. Attempting to construct:")
subprocess.call(['makeblastdb', '-in', arg_badblastdbinput, '-out', badblastdb, '-dbtype',\
'nucl', '-hash_index'])
else:
print("badblastdb " + arg_badblastdb + " does not exist in directory specified by -b, but fasta file specified by -f does not exist.")
sys.exit(2)
else:
print("Error: BLAST command line tools are not installed.")
sys.exit(2)
def check_goodblastdb():
fname = arg_blastdbdir + "/" + arg_goodblastdb + ".nhd"
if os.path.isfile(fname):
print(arg_goodblastdb + " exists in directory specified by -b")
else:
if shutil.which("makeblastdb") is not None:
goodblastdb = arg_blastdbdir + "/" + arg_goodblastdb
if os.path.isfile(arg_goodblastdbinput):
print("goodblastdb "+ arg_goodblastdb + " does not exist in directory specified by -b. Attempting to construct:")
subprocess.call(['makeblastdb', '-in', arg_goodblastdbinput, '-out', goodblastdb, '-dbtype', \
'nucl', '-hash_index'])
else:
print("goodblastdb " + arg_goodblastdb + " does not exist in directory specified by -b, but fasta file specified by -f does not exist.")
sys.exit(2)
else:
print("Error: BLAST command line tools are not installed.")
sys.exit(2)
def split_fasta(file_in,file_out):
print("> Preprocessing query file")
if shutil.which("pyfasta") is not None:
print('Splitting query fasta file into ' + arg_chunks + " chunks")
subprocess.call(['cp', file_in, file_out])
subprocess.call(['pyfasta', 'split', '-n', arg_chunks, file_out], \
stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
subprocess.call(['rm', file_out])
else:
print('Error: pyfasta must be installed. Do \'pip install pyfasta\'.')
sys.exit(2)
def badblastn():
extlen = len(arg_query.split('.', 1))
ext = arg_query.split('.', 1)[extlen-1]
fpath = tempdir + "/*." + ext
print("> Searching for query sequences in "+arg_badblastdb)
flist = glob.glob(fpath)
chunk = 0
for i in flist:
if not os.stat(i).st_size == 0:
chunk = chunk + 1
chunkout = tempdir + "/blastn_chunk" + str(chunk) + ".out1"
print("Running blastn search of query chunk " + str(chunk)+" | "+str(datetime.datetime.now()))
blastdbpath = arg_blastdbdir+"/"+arg_badblastdb
subprocess.call(['blastn', '-db', blastdbpath, '-query', i, '-out', chunkout, \
'-outfmt', '6 qseqid pident qcovs sseqid', '-num_threads', arg_threads, '-max_target_seqs', "1"],stderr=subprocess.DEVNULL)
else:
subprocess.call(['rm', i])
filenames = glob.glob(tempdir+"/*.out1")
with open(arg_outdir+"/blastn_"+arg_badblastdb+"_output.tsv", 'w') as outfile:
for fname in filenames:
with open(fname) as infile:
for line in infile:
outfile.write(line)
def badsort():
print("> Filtering accessions from query due to hits with "+arg_pident+" identity and "\
+arg_qcovs+" coverage on sequences in "+arg_badblastdb+":")
badseqids = []
tophit = []
with pd.read_csv(arg_outdir+"/blastn_"+arg_badblastdb+"_output.tsv", sep='\t', header=None, chunksize=10000) as reader:
for chunk in reader:
chunk.columns = ['qseqid', 'pident', 'qcovs', 'sseqid']
qseqids = chunk.qseqid.unique()
for i in qseqids:
chunk_sub = chunk[chunk["qseqid"].isin([i])].reset_index(drop=True)
if chunk_sub[(chunk_sub["pident"]>=float(arg_pident)) & \
(chunk_sub["qcovs"]>=float(arg_qcovs))].shape[0] > 0:
badseqids.append(i)
tophit.append(chunk_sub.iloc[0]['sseqid'])
print(">>> "+str(len(badseqids))+" accession(s) filtered")
badseqout = pd.DataFrame({'badseqid' : badseqids, 'tophit' : tophit, 'reason' : ["in "+arg_badblastdb] * len(badseqids)})
badseqout.to_csv(tempdir+"/"+fileout+"_badseqids.txt", sep='\t', header=True, index=False)
return badseqids
def filter_fasta_badseqids(badseqids):
headers = []
with open(arg_query, "r") as f:
for record in SeqIO.parse(f, "fasta"):
headers.append(record.description)
goodseqids = set([x for x in headers if x not in badseqids])
print("> Saving temporary fasta file: of "+str(len(headers))+" query accession(s), "\
+str(len(badseqids))+" accession(s) filtered, "+str(len(goodseqids))+" accession(s) retained")
query_filtered = (r for r in SeqIO.parse(arg_query, "fasta") if r.id in goodseqids)
SeqIO.write(query_filtered,tempdir+"/"+fileout+"_nobadseqids.fasta","fasta")
def goodblastn():
extlen = len(arg_query.split('.', 1))
ext = arg_query.split('.', 1)[extlen-1]
fpath = tempdir2 + "/*." + ext
print("> Searching for query sequences in "+arg_goodblastdb)
flist = glob.glob(fpath)
chunk = 0
for i in flist:
if not os.stat(i).st_size == 0:
chunk = chunk + 1
chunkout = tempdir2 + "/blastn_chunk" + str(chunk) + ".out2"
print("Running blastn search of query chunk " + str(chunk)+" | "+str(datetime.datetime.now()))
blastdbpath = arg_blastdbdir+"/"+arg_goodblastdb
subprocess.call(['blastn', '-db', blastdbpath, '-query', i, '-out', chunkout, \
'-outfmt', '6 qseqid pident qcovs sseqid', '-num_threads', arg_threads, '-max_target_seqs', "10"],stderr=subprocess.DEVNULL)
else:
subprocess.call(['rm', i])
filenames = glob.glob(tempdir2+"/*.out2")
with open(arg_outdir+"/blastn_"+arg_goodblastdb+"_output.tsv", 'w') as outfile:
for fname in filenames:
with open(fname) as infile:
for line in infile:
outfile.write(line)
def goodsort():
print("> Filtering accessions from query due to hits with < "+arg_pident+" identity and < "\
+arg_qcovs+" coverage on sequences in "+arg_goodblastdb+":")
badseqids = []
badtophit = []
goodseqids = []
goodtophit = []
with pd.read_csv(arg_outdir+"/blastn_"+arg_goodblastdb+"_output.tsv", sep='\t', header=None, chunksize=10000) as reader:
for chunk in reader:
chunk.columns = ['qseqid', 'pident', 'qcovs', 'sseqid']
qseqids = list(set(chunk.qseqid.unique()) - set(badseqids + goodseqids))
for i in qseqids:
chunk_sub = chunk[chunk["qseqid"].isin([i])].reset_index(drop=True)
if chunk_sub[(chunk_sub['pident']>=float(arg_pident)) & \
(chunk_sub["qcovs"]>=float(arg_qcovs))].shape[0] > 0:
goodseqids = goodseqids + list(chunk_sub[(chunk_sub['pident']>=float(arg_pident)) & \
(chunk_sub["qcovs"]>=float(arg_qcovs))]['qseqid'])
goodtophit = goodtophit + list(chunk_sub[(chunk_sub['pident']>=float(arg_pident)) & \
(chunk_sub["qcovs"]>=float(arg_qcovs))]['sseqid'])
else:
badseqids.append(i)
badtophit.append(chunk_sub.iloc[0]['sseqid'])
print(">>> "+str(len(badseqids))+" accession(s) flagged")
badseqout = pd.DataFrame({'badseqid' : badseqids, 'tophit' : badtophit, 'reason' : ["not in "+arg_goodblastdb] * len(badseqids)})
goodseqout = pd.DataFrame({'goodseqid' : goodseqids, 'tophit' : goodtophit})
oldbadseqids = pd.read_csv(tempdir+"/"+fileout+"_badseqids.txt", sep='\t')
newbadseqids = pd.concat([oldbadseqids,badseqout])
newbadseqids.to_csv(tempdir+"/"+fileout+"_badseqids.txt", sep='\t', header=True, index=False)
goodseqout.to_csv(tempdir+"/"+fileout+"_goodseqids.txt", sep='\t', header=True, index=False)
def taxafilter():
scriptdir = os.path.realpath(os.path.dirname(__file__))
print("> Collating taxonomy information and filtering sequences with mislabeled "+arg_filterlevel)
print("taxafilter.R is in "+scriptdir)
subprocess.call(['Rscript', scriptdir+'/taxafilter.R', arg_taxadb+"/accessionTaxa.sql",\
tempdir+"/"+fileout+"_badseqids.txt", tempdir+"/"+fileout+"_goodseqids.txt",\
arg_outdir, arg_filterlevel, fileout, arg_badblastdb, arg_goodblastdb, arg_addfilter])
def filter_fasta_goodseqids():
headers = []
with open(arg_query, "r") as f:
for record in SeqIO.parse(f, "fasta"):
headers.append(record.description)
badseqids = list(pd.read_csv(tempdir+"/"+fileout+"_badseqids.txt", sep='\t', header=0)['badseqid'])
goodseqids = set([x for x in headers if x not in badseqids])
print("> Saving cleaned fasta file: of "+str(len(headers))+" query accession(s), "\
+str(len(badseqids))+" accession(s) filtered, "+str(len(goodseqids))+" accession(s) retained")
query_filtered = (r for r in SeqIO.parse(arg_query, "fasta") if r.id in goodseqids)
SeqIO.write(query_filtered,arg_outdir+"/"+fileout+"_clean.fasta","fasta")
newheaders = []
with open(arg_outdir+"/"+fileout+"_clean.fasta", "r") as f:
for record in SeqIO.parse(f, "fasta"):
newheaders.append(record.description)
cleantax = pd.read_csv(arg_outdir+"/"+fileout+"_clean.tax", header=None)
cleantax['qseqidcat'] = pd.Categorical(cleantax[1], categories=newheaders, ordered=True)
cleantax.sort_values('qseqidcat', inplace=True)
cleantax = cleantax.drop('qseqidcat', axis=1)
cleantax.to_csv(arg_outdir+"/"+fileout+"_clean.tax", header=None, index=None)
def cleanup():
subprocess.call(['cp', tempdir+"/"+fileout+"_badseqids.txt", arg_outdir+"/"+fileout+"_badseqids.txt"])
if arg_sortonly=="F":
subprocess.call(['rm', '-r', tempdir])
else:
subprocess.call(['rm', tempdir+"/"+fileout+"_goodseqids.txt"])
subprocess.call(['rm', '-r', tempdir2])
if __name__ == "__main__":
myfunc(sys.argv)
global tempdir
global tempdir2
global fileout
head, tail = os.path.split(arg_query)
fileout = tail.split('.', 1)[0]
if arg_sortonly=="T":
tempdir = arg_outdir
if arg_sortonly=="F":
tempdir = arg_outdir + "/" + "temp_" + str(time.time()).split('.', 1)[0]
print('Creating temporary directory at: ' + tempdir)
os.makedirs(tempdir)
check_badblastdb()
check_goodblastdb()
split_fasta(arg_query,tempdir+"/"+fileout+".fasta")
badblastn()
badseqids = badsort()
filter_fasta_badseqids(badseqids)
tempdir2 = tempdir + "/" + "temp_" + str(time.time()).split('.', 1)[0]
os.makedirs(tempdir2)
if arg_sortonly=="F":
split_fasta(tempdir+"/"+fileout+"_nobadseqids.fasta",tempdir2+"/"+fileout+"_nobadseqids.fasta")
goodblastn()
goodsort()
taxafilter()
filter_fasta_goodseqids()
cleanup()