Skip to content

Commit

Permalink
fixed the bug in filtering results when core and ns5b contig is joine…
Browse files Browse the repository at this point in the history
…d together
  • Loading branch information
sherrie9 committed Jun 29, 2023
1 parent 85afb27 commit 1064028
Showing 1 changed file with 13 additions and 2 deletions.
15 changes: 13 additions & 2 deletions bin/genotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,10 +315,12 @@ def filter_alignments(output, blast_out, min_cov, min_id):
blast_results.to_csv(os.path.join(output, output + '_blast_results_prefilter.csv'))
# Keep only best alingments for each contig (by bitscore)
blast_results = blast_results[(blast_results['qlen'] >= 300)]
best_bitscores = blast_results[['qseqid', 'bitscore']].groupby('qseqid').max().reset_index()
blast_results = pd.merge(blast_results, best_bitscores, on=['qseqid', 'bitscore'])
best_bitscores = blast_results[['qseqid', 'bitscore','amplicon']].groupby(['qseqid','amplicon']).max().reset_index()
blast_results = pd.merge(blast_results, best_bitscores, on=['qseqid', 'bitscore','amplicon'])
best_bitscores = blast_results[['amplicon','subtype', 'bitscore']].groupby(['amplicon','subtype']).max().reset_index()
blast_results = pd.merge(blast_results, best_bitscores, on=['amplicon','subtype', 'bitscore'])
#best_bitscores = blast_results.groupby(['amplicon','subtype']).head(1).reset_index()

#only allow one contig to one subtype, with the same bitscore
cols=['amplicon','subtype','bitscore']
blast_results = blast_results.sort_values(by=['bitscore'],ascending=False)
Expand Down Expand Up @@ -356,6 +358,7 @@ def write_best_contigs_fasta(output, blast_results, contigs):
'''Looks up best contigs in contigs FASTA file and writes them to their own FASTA file.
Returns path to best contigs FASTA file.'''
# De-duplicate rows from contigs with best alignments to multiple ref seqs
#TODO: add a condition to not write the contig if the same sequence already
cols = ['qseqid', 'segment','subtype', 'slen','qstart','qend','amplicon']

blast_results = blast_results[cols].drop_duplicates(subset=['qseqid', 'segment','subtype', 'slen','amplicon'], keep='first')
Expand All @@ -375,15 +378,23 @@ def write_best_contigs_fasta(output, blast_results, contigs):
# Write best contigs to FASTA
with open(best_contigs, 'w') as output_file:
contig_counter = 1
seqs_to_write = {}
for index in blast_results.index:
contig_name = blast_results['qseqid'][index]
segment = blast_results['segment'][index]
subtype = blast_results['subtype'][index]
segment_length = blast_results['slen'][index]
amplicon = blast_results['amplicon'][index]
header = f'>{output}_seq_{contig_counter}|{amplicon}|{segment}|{subtype}|{segment_length}'
if contig_seqs[contig_name] in seqs_to_write.values() and len(contig_seqs[contig_name]) < 600 :
continue


seqs_to_write[contig_name] = ''
seqs_to_write[contig_name] += contig_seqs[contig_name]
output_file.write(header + '\n')
output_file.write(contig_seqs[contig_name] + '\n')

contig_counter += 1
return best_contigs

Expand Down

0 comments on commit 1064028

Please sign in to comment.