Skip to content

Commit

Permalink
Functionality to compare consensus sequence to reference
Browse files Browse the repository at this point in the history
  • Loading branch information
mcmero committed Jun 25, 2024
1 parent ad84138 commit b60d9ee
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 0 deletions.
111 changes: 111 additions & 0 deletions bin/align_pairs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env python
'''
Module : Align sequence pairs
Description : Take two fasta files and perform pairwise-alignment
on each sequence pair
Copyright : (c) WEHI Genomics R&D, 2024
License : TBD
Maintainer : Marek Cmero
Portability : POSIX
'''

import sys
import os
import edlib
import logging
import pyfastx as fx
from datetime import datetime
from argparse import ArgumentParser

debug = False
BASE_BUFFER_LEN = 50


def parse_args():
'''Parse arguments'''
description = '''
Take two fasta files and perform pairwise-alignment
on each sequence pair.
Usage:
python align_sequence_pairs.py query.fasta target.fasta
Output:
Returns aligned sequence pairs to stdout
'''
parser = ArgumentParser(description=description)
parser.add_argument('query_fasta',
type=str,
help='Input query FASTA file')
parser.add_argument('target_fasta',
type=str,
help='Input target FASTA file')
parser.add_argument('--debug',
action='store_true',
help='Enable debug logging')

return parser.parse_args()


def init_logging(log_filename):
'''
Initiate logging
'''
logging.basicConfig(filename=log_filename,
level=logging.DEBUG,
filemode='w',
format='%(asctime)s %(levelname)s - %(message)s',
datefmt="%Y-%m-%dT%H:%M:%S%z")
logging.info('Align sequence pairs started')


def main():
'''Main'''
args = parse_args()

if not os.path.exists(args.query_fasta) or \
not os.path.exists(args.target_fasta):
sys.exit('Error: FASTA file(s) do not exist')

if args.debug:
global debug
debug = True
init_logging('rotate_{:%Y-%m-%d_%H%M}.log'.format(datetime.now()))

queries = fx.Fasta(args.query_fasta, build_index=False)
targets = fx.Fasta(args.target_fasta, build_index=False)

target_dict = {}
for target in targets:
name, seq = target
target_dict[name] = seq

for query in queries:
name, seq = query

if name not in target_dict:
logging.warning(f'Query {name} not found in target')
continue

target_seq = target_dict[name]
result = edlib.align(seq, target_seq, mode='HW', task='path')
result = edlib.getNiceAlignment(result, seq, target_seq)

print('----------------------------------------')
print(f'{name}')
print('----------------------------------------')
align_len = len(result['query_aligned'])
for i in range(0, align_len, BASE_BUFFER_LEN):
query_seq = result['query_aligned'][i:i+BASE_BUFFER_LEN]
matched_seq = result['matched_aligned'][i:i+BASE_BUFFER_LEN]
target_seq = result['target_aligned'][i:i+BASE_BUFFER_LEN]
print(f'{name}_QUERY\t{query_seq}')
print(f'{name}_MATCH\t{matched_seq}')
print(f'{name}_TARGT\t{target_seq}\n')

if debug:
logging.info('Align sequence pairs finished')


if __name__ == '__main__':
main()
1 change: 1 addition & 0 deletions envs/biopython.yaml
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ dependencies:
- python-edlib
- pandas
- natsort
- pyfastx

27 changes: 27 additions & 0 deletions modules/align_pairs.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
process AlignPairs {
tag "${consensusFasta.getSimpleName()}"
label = "AlignPairs"

publishDir "${params.outdir}/consensus/${sampleName}", mode: 'copy'

conda "${ params.conda_env_location != null && params.conda_env_location != '' ?
params.conda_env_location + '/biopython' :
projectDir + '/envs/biopython.yaml' }"

input:
tuple val(sampleName), path(consensusFasta)
path(reference)

output:
tuple val(sampleName), path("*.txt"), emit: pair_alignments

script:
def prefix = task.ext.prefix ?: consensusFasta.getSimpleName()
"""
if [ -n "\$(gunzip < ${consensusFasta} | head -c 1 | tr '\0\n' __)" ]; then
align_pairs.py ${consensusFasta} ${reference} > ${prefix}_ref_align.txt
else
touch ${prefix}_ref_align.txt
fi
"""
}
6 changes: 6 additions & 0 deletions subworkflows/consensus.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
include { Racon } from '../modules/racon'
include { Medaka } from '../modules/medaka'
include { AlignPairs } from '../modules/align_pairs.nf'

process PrepareForConsensus {
label = "PrepareForConsensus"
Expand Down Expand Up @@ -41,6 +42,11 @@ workflow Consensus {

consensus_sequences = Medaka.out.assembly

AlignPairs(consensus_sequences, reference)

pair_alignments = AlignPairs.out.pair_alignments

emit:
consensus_sequences
pair_alignments
}

0 comments on commit b60d9ee

Please sign in to comment.