Custom counting script

- Replaced samtools with python script for counting, this script checks that the reads span the entire insert region before counting - Update conda env to handle counting script - Bash script collates counts into one text file - Minor improvements to documentation
WEHIGenomicsRnD · May 3, 2024 · 826be29 · 826be29
1 parent 3d52ed4
commit 826be29
Show file tree

Hide file tree

Showing 5 changed files with 112 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -1,8 +1,11 @@
 # Nanopore Overhang Preprocess
 
-A Nextflow pipeline for the preprocessing of reads using the overhang protocol run on Oxford Nanopore Technology. The pipeline locates the specified forward and reverse primer sequences and trims the read N bases upstream of the forward primer and N bases downstream of the reverse primer. The 5' and 3' ends will then contain index information ready for demultiplexing.
+A Nextflow pipeline for the preprocessing of reads using the overhang protocol run on Oxford Nanopore Technology. The pipeline locates the specified forward and reverse primer sequences and trims the read N bases upstream of the forward primer and N bases downstream of the reverse primer, where N is the barcode length. The pipeline then performs demultiplexing according to the provided barcodes and counts guide sequences if a fasta file of sequences is provided to count against.
+
+The read structure is typically:
+
+`[sequence][fwd_index][fwd_primer][sequence_of_interest][rev_primer][rev_index][sequence]`
 
-The read structure is typically `[sequence][fwd_index][fwd_primer][sequence_of_interest][rev_primer][rev_index][sequence]`. The pipeline trims off the 5' and 3' superfluous 'sequence'. 
 
 ## How to install
 
@@ -27,6 +30,7 @@ nextflow run main.nf --input_dir $path_to_fastqs \
     --rev_primer GTCTGATCGTGCTGCTGAT \
     --mismatches 3 \
     --barcode_length 12 \
+    --guides_fasta $guides_fasta \
     -profile log,milton
 ```
 
@@ -40,4 +44,5 @@ Here are the parameters you will need to set:
 - `--rev_primer`: reverse primer sequence.
 - `--mismatches`: how many mismatches are allowed in the primer sequences. Calculated as the levehnstein edit distance using [edlib](https://github.com/Martinsos/edlib).
 - `--barcode_length`: how many bases to trim to the left and right of the primer sequences. If your barcode includes spacers make sure to take that into account (i.e., non-informative bases between the index and primer).
+- `--guides_fasta`: (optional) fasta file contains guide sequences to count.
 - `--log`: enables logging, which writes the IDs of failed reads to a log file and specifies why each read could not be processed.
diff --git a/bin/count_guides.py b/bin/count_guides.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+'''
+Module      : count_guides
+Description : Count guide sequences from aligned bam,
+              ensuring that read spans the whole guide
+Copyright   : (c) WEHI Genomics R&D, 2024
+License     : TBD
+Maintainer  : Marek Cmero
+Portability : POSIX
+'''
+import os
+import sys
+import pysam
+from Bio import SeqIO
+from argparse import ArgumentParser
+
+
+def parse_args():
+    '''Parse arguments'''
+    description = '''
+        Count guide sequences
+
+        Usage:
+            count_guides.py <bam> <guide_reference>
+
+        Outputs a count table of guide sequences from aligned bam.
+        '''
+    parser = ArgumentParser(description=description)
+    parser.add_argument('bam',
+                        type=str,
+                        help='Reads in fastq format.')
+    parser.add_argument('guide_reference',
+                        type=str,
+                        help='Guide reference fasta file.')
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if not os.path.exists(args.bam):
+        print(f'{args.bam} does not exist', file=sys.stderr)
+        sys.exit(1)
+
+    if not os.path.exists(args.guide_reference):
+        print(f'{args.guide_reference} does not exist', file=sys.stderr)
+        sys.exit(1)
+
+    # create a reference look up for guie lengths
+    guide_lens = {}
+    with open(args.guide_reference) as handle:
+        for record in SeqIO.parse(handle, "fasta"):
+            guide_lens[record.id] = len(record.seq)
+
+    counts = {'unmapped': 0, 'partial_map': 0}
+    for guide in guide_lens:
+        counts[guide] = 0
+
+    bamfile = pysam.AlignmentFile(args.bam, 'r')
+    for read in bamfile:
+        if read.is_unmapped:
+            counts['unmapped'] += 1
+            continue
+
+        if read.reference_name not in guide_lens:
+            print(f'Guide {read.reference_name} not found in reference',
+                  file=sys.stderr)
+            continue
+
+        guide_len = guide_lens[read.reference_name]
+        if read.reference_end - read.reference_start != guide_len:
+            counts['partial_map'] += 1
+            print(f'Guide {read.reference_name} does not span the whole guide',
+                  file=sys.stderr)
+
+        counts[read.reference_name] += 1
+
+    sample_name = os.path.basename(args.bam).split(".")[0]
+    print(f'guide\t{sample_name}')
+    for guide, count in counts.items():
+        print(f'{guide}\t{count}')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/envs/biopython.yaml b/envs/biopython.yaml
@@ -4,4 +4,5 @@ channels:
   - bioconda
 dependencies:
   - biopython
-  - python-edlib
+  - python-edlib
+
diff --git a/envs/minimap-samtools.yaml b/envs/minimap-samtools.yaml
@@ -4,4 +4,7 @@ channels:
   - bioconda
 dependencies:
   - minimap2
-  - samtools
+  - samtools
+  - biopython
+  - pysam
+
diff --git a/modules/count.nf b/modules/count.nf
@@ -42,6 +42,12 @@ process CountGuides {
 
     script:
     def outCounts = "${sampleName}_guide_counts.txt"
+    /*
+    count collation is a hacky bash script to get collated output,
+    the script pastes the count files together, and then cuts out
+    the relevant count fields (keep the first as a label column)
+    we will want to make this into a nice python script later
+    */
     """
     for fastq in ${fastqs};
     do
@@ -52,9 +58,12 @@ process CountGuides {
             samtools view -S -b | \
             samtools sort -o \${sample}.bam
 
-        samtools index \${sample}.bam
-
-        samtools idxstats \${sample}.bam > \${sample}_counts.txt
+        count_guides.py \${sample}.bam ${params.guides_fasta} > \${sample}_counts.txt
     done
+
+    paste *_counts.txt > tmpfile
+    ncounts=\$(expr \$(ls *_counts.txt | wc -l | xargs) \\* 2)
+    fields_to_cut=\$(echo 1 \$(seq 2 2 \$ncounts) | sed 's/ /,/g')
+    cut -f \$fields_to_cut tmpfile > ${outCounts}
     """
-}
+}