Skip to content

Commit

Permalink
Collate counts step
Browse files Browse the repository at this point in the history
- Rewrote count collation to collate counts across
  all samples
- Added two simulated samples to test multi-sample
  collation
- top-level sample name now carried through to count
  matrix
  • Loading branch information
mcmero committed May 18, 2024
1 parent 98e8cb3 commit df832bf
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 42 deletions.
40 changes: 20 additions & 20 deletions .test/data/guides_simulated.fasta

Large diffs are not rendered by default.

Binary file removed .test/data/overhang_simulated_reads.fastq.gz
Binary file not shown.
Binary file added .test/data/simu_sampleA.fastq.gz
Binary file not shown.
Binary file added .test/data/simu_sampleB.fastq.gz
Binary file not shown.
15 changes: 8 additions & 7 deletions .test/overhang_simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,11 @@ def generate_varied_read(read_id, combination_index, guide_sequences):
guides_file.write(f">guide_{guide_id}\n{guide_sequence}\n")

# Write the FastQ file
fastq_filename = "overhang_simulated_reads.fastq"
with open(fastq_filename, 'w') as fastq_file:
for read_id in range(1, NUM_READS + 1):
combination_index = (read_id - 1) % len(INDEX_COMBINATIONS)
fastq_file.write(generate_varied_read(read_id, combination_index, guide_sequences))

print(f"FastQ file generated: {fastq_filename}")
for sample in ['A', 'B']:
fastq_filename = f"simu_sample{sample}.fastq"
with open(fastq_filename, 'w') as fastq_file:
for read_id in range(1, NUM_READS + 1):
combination_index = (read_id - 1) % len(INDEX_COMBINATIONS)
fastq_file.write(generate_varied_read(read_id, combination_index, guide_sequences))

print(f"FastQ file generated: {fastq_filename}")
52 changes: 52 additions & 0 deletions bin/collate_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/env python
'''
Module : collate_counts
Description : Collate multiple count files into one sorted table
Copyright : (c) WEHI Genomics R&D, 2024
License : TBD
Maintainer : Marek Cmero
Portability : POSIX
'''
import pandas as pd
from natsort import index_natsorted, order_by_index
from argparse import ArgumentParser


def parse_args():
'''Parse arguments'''
description = '''
Count guide sequences
Usage:
collate_counts.py <count_files>
Outputs a table of merged counts
'''
parser = ArgumentParser(description=description)
parser.add_argument('count_files',
nargs='+',
type=str,
help='Count files to merge')

return parser.parse_args()


def main():
args = parse_args()

result_df = pd.DataFrame()
for count_file in args.count_files:
df = pd.read_csv(count_file, sep='\t')

if len(result_df) == 0:
result_df = df
else:
result_df = pd.merge(result_df, df, how='outer', on='guide')

new_index = order_by_index(result_df.index, index_natsorted(df['guide']))
result_df = result_df.reindex(index=new_index)
print(result_df.to_string(index=False))


if __name__ == "__main__":
main()
5 changes: 4 additions & 1 deletion bin/count_guides.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ def parse_args():
parser.add_argument('guide_reference',
type=str,
help='Guide reference fasta file.')
parser.add_argument('sample',
type=str,
help='Sample name.')
parser.add_argument('--lenient',
action='store_true',
help='Count partial mappings.')
Expand Down Expand Up @@ -80,7 +83,7 @@ def main():
print(f'Guide {read.reference_name} does not span the whole guide',
file=sys.stderr)

sample_name = os.path.basename(args.bam).split(".")[0]
sample_name = args.sample + "_" + os.path.basename(args.bam).split(".")[0]
print(f'guide\t{sample_name}')
for guide, count in counts.items():
print(f'{guide}\t{count}')
Expand Down
2 changes: 2 additions & 0 deletions envs/biopython.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ channels:
dependencies:
- biopython
- python-edlib
- pandas
- natsort

4 changes: 3 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ include { CreateConfigFile } from './modules/demux.nf'
include { SplitCode } from './modules/demux.nf'
include { IndexGuides } from './modules/count.nf'
include { CountGuides } from './modules/count.nf'
include { CollateCounts } from './modules/count.nf'
if (params.use_db) {
include { fromQuery } from 'plugin/nf-sqldb'
}
Expand Down Expand Up @@ -110,6 +111,7 @@ workflow {
if (params.guides_fasta != null && params.guides_fasta != '') {
def guidesIndex = file(params.guides_fasta).getSimpleName() + ".mmi"
IndexGuides(params.guides_fasta).set{index_ch}
CountGuides(index_ch.done, demux_ch, file("${params.outdir}/${guidesIndex}"))
CountGuides(index_ch.done, demux_ch, file("${params.outdir}/${guidesIndex}")).set{count_ch}
CollateCounts(count_ch.counts.collect())
}
}
35 changes: 22 additions & 13 deletions modules/count.nf
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,11 @@ process CountGuides {

output:
path "*.bam*"
path "*.txt"
path "*.txt", emit: counts

script:
def outCounts = "${sampleName}_guide_counts.txt"
def lenientFlag = params.lenient_counts ? "--lenient" : ""
def extraThreads = task.cpus - 1
/*
count collation is a hacky bash script to get collated output,
the script pastes the count files together, and then cuts out
the relevant count fields (keep the first as a label column)
we will want to make this into a nice python script later
*/
"""
for fastq in ${fastqs};
do
Expand All @@ -60,12 +53,28 @@ process CountGuides {
samtools view -S -b -@ ${extraThreads} | \
samtools sort -@ ${extraThreads} -o \${sample}.bam
count_guides.py \${sample}.bam ${params.guides_fasta} ${lenientFlag} > \${sample}_counts.txt
count_guides.py \${sample}.bam ${params.guides_fasta} ${sampleName} ${lenientFlag} > ${sampleName}_\${sample}_counts.txt
done
"""
}

process CollateCounts {
label = "CollateCounts"

publishDir "${params.outdir}/count", mode: 'copy'

conda "${ params.conda_env_location != null && params.conda_env_location != '' ?
params.conda_env_location + '/biopython' :
projectDir + '/envs/biopython.yaml' }"

paste *_counts.txt > tmpfile
ncounts=\$(expr \$(ls *_counts.txt | wc -l | xargs) \\* 2)
fields_to_cut=\$(echo 1 \$(seq 2 2 \$ncounts) | sed 's/ /,/g')
cut -f \$fields_to_cut tmpfile > ${outCounts}
input:
path counts

output:
path "collated_counts.txt"

script:
"""
collate_counts.py ${counts} > collated_counts.txt
"""
}

0 comments on commit df832bf

Please sign in to comment.