Functionality to compare consensus sequence to reference

WEHIGenomicsRnD · Jun 25, 2024 · b60d9ee · b60d9ee
1 parent ad84138
commit b60d9ee
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 0 deletions.
diff --git a/bin/align_pairs.py b/bin/align_pairs.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+'''
+Module : Align sequence pairs
+Description : Take two fasta files and perform pairwise-alignment
+ on each sequence pair
+Copyright : (c) WEHI Genomics R&D, 2024
+License : TBD
+Maintainer : Marek Cmero
+Portability : POSIX
+'''
+
+import sys
+import os
+import edlib
+import logging
+import pyfastx as fx
+from datetime import datetime
+from argparse import ArgumentParser
+
+debug = False
+BASE_BUFFER_LEN = 50
+
+
+def parse_args():
+ '''Parse arguments'''
+ description = '''
+ Take two fasta files and perform pairwise-alignment
+ on each sequence pair.
+
+ Usage:
+ python align_sequence_pairs.py query.fasta target.fasta
+
+ Output:
+ Returns aligned sequence pairs to stdout
+ '''
+ parser = ArgumentParser(description=description)
+ parser.add_argument('query_fasta',
+ type=str,
+ help='Input query FASTA file')
+ parser.add_argument('target_fasta',
+ type=str,
+ help='Input target FASTA file')
+ parser.add_argument('--debug',
+ action='store_true',
+ help='Enable debug logging')
+
+ return parser.parse_args()
+
+
+def init_logging(log_filename):
+ '''
+ Initiate logging
+ '''
+ logging.basicConfig(filename=log_filename,
+ level=logging.DEBUG,
+ filemode='w',
+ format='%(asctime)s %(levelname)s - %(message)s',
+ datefmt="%Y-%m-%dT%H:%M:%S%z")
+ logging.info('Align sequence pairs started')
+
+
+def main():
+ '''Main'''
+ args = parse_args()
+
+ if not os.path.exists(args.query_fasta) or \
+ not os.path.exists(args.target_fasta):
+ sys.exit('Error: FASTA file(s) do not exist')
+
+ if args.debug:
+ global debug
+ debug = True
+ init_logging('rotate_{:%Y-%m-%d_%H%M}.log'.format(datetime.now()))
+
+ queries = fx.Fasta(args.query_fasta, build_index=False)
+ targets = fx.Fasta(args.target_fasta, build_index=False)
+
+ target_dict = {}
+ for target in targets:
+ name, seq = target
+ target_dict[name] = seq
+
+ for query in queries:
+ name, seq = query
+
+ if name not in target_dict:
+ logging.warning(f'Query {name} not found in target')
+ continue
+
+ target_seq = target_dict[name]
+ result = edlib.align(seq, target_seq, mode='HW', task='path')
+ result = edlib.getNiceAlignment(result, seq, target_seq)
+
+ print('----------------------------------------')
+ print(f'{name}')
+ print('----------------------------------------')
+ align_len = len(result['query_aligned'])
+ for i in range(0, align_len, BASE_BUFFER_LEN):
+ query_seq = result['query_aligned'][i:i+BASE_BUFFER_LEN]
+ matched_seq = result['matched_aligned'][i:i+BASE_BUFFER_LEN]
+ target_seq = result['target_aligned'][i:i+BASE_BUFFER_LEN]
+ print(f'{name}_QUERY\t{query_seq}')
+ print(f'{name}_MATCH\t{matched_seq}')
+ print(f'{name}_TARGT\t{target_seq}\n')
+
+ if debug:
+ logging.info('Align sequence pairs finished')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/envs/biopython.yaml b/envs/biopython.yaml
@@ -7,4 +7,5 @@ dependencies:
  - python-edlib
  - pandas
  - natsort
+ - pyfastx
 
diff --git a/modules/align_pairs.nf b/modules/align_pairs.nf
@@ -0,0 +1,27 @@
+process AlignPairs {
+ tag "${consensusFasta.getSimpleName()}"
+ label = "AlignPairs"
+
+ publishDir "${params.outdir}/consensus/${sampleName}", mode: 'copy'
+
+ conda "${ params.conda_env_location != null && params.conda_env_location != '' ?
+ params.conda_env_location + '/biopython' :
+ projectDir + '/envs/biopython.yaml' }"
+
+ input:
+ tuple val(sampleName), path(consensusFasta)
+ path(reference)
+
+ output:
+ tuple val(sampleName), path("*.txt"), emit: pair_alignments
+
+ script:
+ def prefix = task.ext.prefix ?: consensusFasta.getSimpleName()
+ """
+ if [ -n "\$(gunzip < ${consensusFasta} | head -c 1 | tr '\0\n' __)" ]; then
+ align_pairs.py ${consensusFasta} ${reference} > ${prefix}_ref_align.txt
+ else
+ touch ${prefix}_ref_align.txt
+ fi
+ """
+}
diff --git a/subworkflows/consensus.nf b/subworkflows/consensus.nf
@@ -1,5 +1,6 @@
 include { Racon } from '../modules/racon'
 include { Medaka } from '../modules/medaka'
+include { AlignPairs } from '../modules/align_pairs.nf'
 
 process PrepareForConsensus {
  label = "PrepareForConsensus"
@@ -41,6 +42,11 @@ workflow Consensus {
 
  consensus_sequences = Medaka.out.assembly
 
+ AlignPairs(consensus_sequences, reference)
+
+ pair_alignments = AlignPairs.out.pair_alignments
+
  emit:
  consensus_sequences
+ pair_alignments
 }