Collate counts step

- Rewrote count collation to collate counts across all samples - Added two simulated samples to test multi-sample collation - top-level sample name now carried through to count matrix
WEHIGenomicsRnD · May 18, 2024 · df832bf · df832bf
1 parent 98e8cb3
commit df832bf
Show file tree

Hide file tree

Showing 10 changed files with 111 additions and 42 deletions.
diff --git a/.test/data/guides_simulated.fasta b/.test/data/guides_simulated.fasta
diff --git a/.test/data/overhang_simulated_reads.fastq.gz b/.test/data/overhang_simulated_reads.fastq.gz
diff --git a/.test/data/simu_sampleA.fastq.gz b/.test/data/simu_sampleA.fastq.gz
diff --git a/.test/data/simu_sampleB.fastq.gz b/.test/data/simu_sampleB.fastq.gz
diff --git a/.test/overhang_simulation.py b/.test/overhang_simulation.py
@@ -102,10 +102,11 @@ def generate_varied_read(read_id, combination_index, guide_sequences):
         guides_file.write(f">guide_{guide_id}\n{guide_sequence}\n")
 
 # Write the FastQ file
-fastq_filename = "overhang_simulated_reads.fastq"
-with open(fastq_filename, 'w') as fastq_file:
-    for read_id in range(1, NUM_READS + 1):
-        combination_index = (read_id - 1) % len(INDEX_COMBINATIONS)
-        fastq_file.write(generate_varied_read(read_id, combination_index, guide_sequences))
-
-print(f"FastQ file generated: {fastq_filename}")
+for sample in ['A', 'B']:
+    fastq_filename = f"simu_sample{sample}.fastq"
+    with open(fastq_filename, 'w') as fastq_file:
+        for read_id in range(1, NUM_READS + 1):
+            combination_index = (read_id - 1) % len(INDEX_COMBINATIONS)
+            fastq_file.write(generate_varied_read(read_id, combination_index, guide_sequences))
+
+    print(f"FastQ file generated: {fastq_filename}")
diff --git a/bin/collate_counts.py b/bin/collate_counts.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+'''
+Module      : collate_counts
+Description : Collate multiple count files into one sorted table
+Copyright   : (c) WEHI Genomics R&D, 2024
+License     : TBD
+Maintainer  : Marek Cmero
+Portability : POSIX
+'''
+import pandas as pd
+from natsort import index_natsorted, order_by_index
+from argparse import ArgumentParser
+
+
+def parse_args():
+    '''Parse arguments'''
+    description = '''
+        Count guide sequences
+
+        Usage:
+            collate_counts.py <count_files>
+
+        Outputs a table of merged counts
+        '''
+    parser = ArgumentParser(description=description)
+    parser.add_argument('count_files',
+                        nargs='+',
+                        type=str,
+                        help='Count files to merge')
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    result_df = pd.DataFrame()
+    for count_file in args.count_files:
+        df = pd.read_csv(count_file, sep='\t')
+
+        if len(result_df) == 0:
+            result_df = df
+        else:
+            result_df = pd.merge(result_df, df, how='outer', on='guide')
+
+    new_index = order_by_index(result_df.index, index_natsorted(df['guide']))
+    result_df = result_df.reindex(index=new_index)
+    print(result_df.to_string(index=False))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/count_guides.py b/bin/count_guides.py
@@ -32,6 +32,9 @@ def parse_args():
     parser.add_argument('guide_reference',
                         type=str,
                         help='Guide reference fasta file.')
+    parser.add_argument('sample',
+                        type=str,
+                        help='Sample name.')
     parser.add_argument('--lenient',
                         action='store_true',
                         help='Count partial mappings.')
@@ -80,7 +83,7 @@ def main():
             print(f'Guide {read.reference_name} does not span the whole guide',
                   file=sys.stderr)
 
-    sample_name = os.path.basename(args.bam).split(".")[0]
+    sample_name = args.sample + "_" + os.path.basename(args.bam).split(".")[0]
     print(f'guide\t{sample_name}')
     for guide, count in counts.items():
         print(f'{guide}\t{count}')

diff --git a/envs/biopython.yaml b/envs/biopython.yaml
@@ -5,4 +5,6 @@ channels:
 dependencies:
   - biopython
   - python-edlib
+  - pandas
+  - natsort
 
diff --git a/main.nf b/main.nf
@@ -16,6 +16,7 @@ include { CreateConfigFile } from './modules/demux.nf'
 include { SplitCode } from './modules/demux.nf'
 include { IndexGuides } from './modules/count.nf'
 include { CountGuides } from './modules/count.nf'
+include { CollateCounts } from './modules/count.nf'
 if (params.use_db) {
     include { fromQuery } from 'plugin/nf-sqldb'
 }
@@ -110,6 +111,7 @@ workflow {
     if (params.guides_fasta != null && params.guides_fasta != '') {
         def guidesIndex = file(params.guides_fasta).getSimpleName() + ".mmi"
         IndexGuides(params.guides_fasta).set{index_ch}
-        CountGuides(index_ch.done, demux_ch, file("${params.outdir}/${guidesIndex}"))
+        CountGuides(index_ch.done, demux_ch, file("${params.outdir}/${guidesIndex}")).set{count_ch}
+        CollateCounts(count_ch.counts.collect())
     }
 }
diff --git a/modules/count.nf b/modules/count.nf
@@ -38,18 +38,11 @@ process CountGuides {
 
     output:
     path "*.bam*"
-    path "*.txt"
+    path "*.txt", emit: counts
 
     script:
-    def outCounts = "${sampleName}_guide_counts.txt"
     def lenientFlag = params.lenient_counts ? "--lenient" : ""
     def extraThreads = task.cpus - 1
-    /*
-    count collation is a hacky bash script to get collated output,
-    the script pastes the count files together, and then cuts out
-    the relevant count fields (keep the first as a label column)
-    we will want to make this into a nice python script later
-    */
     """
     for fastq in ${fastqs};
     do
@@ -60,12 +53,28 @@ process CountGuides {
             samtools view -S -b -@ ${extraThreads} | \
             samtools sort -@ ${extraThreads} -o \${sample}.bam
 
-        count_guides.py \${sample}.bam ${params.guides_fasta} ${lenientFlag} > \${sample}_counts.txt
+        count_guides.py \${sample}.bam ${params.guides_fasta} ${sampleName} ${lenientFlag} > ${sampleName}_\${sample}_counts.txt
     done
+    """
+}
+
+process CollateCounts {
+    label = "CollateCounts"
+
+    publishDir "${params.outdir}/count", mode: 'copy'
+
+    conda "${ params.conda_env_location != null && params.conda_env_location != '' ?
+              params.conda_env_location + '/biopython' :
+              projectDir + '/envs/biopython.yaml' }"
 
-    paste *_counts.txt > tmpfile
-    ncounts=\$(expr \$(ls *_counts.txt | wc -l | xargs) \\* 2)
-    fields_to_cut=\$(echo 1 \$(seq 2 2 \$ncounts) | sed 's/ /,/g')
-    cut -f \$fields_to_cut tmpfile > ${outCounts}
+    input:
+    path counts
+
+    output:
+    path "collated_counts.txt"
+
+    script:
+    """
+    collate_counts.py ${counts} > collated_counts.txt
     """
 }