Skip to content

Commit

Permalink
Cleaned code and made fixes for alpha 2
Browse files Browse the repository at this point in the history
  • Loading branch information
jggatter committed Jan 7, 2021
1 parent e1df2b0 commit f3b2f10
Show file tree
Hide file tree
Showing 13 changed files with 266 additions and 968 deletions.
12 changes: 4 additions & 8 deletions filter_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
Run using python -m filter_queries <args>
Requirements:
Python 3.8.5, samtools, pysam
Python >3.8.5, samtools, pysam, biopython, pandas
"""
import argparse
import os.path as osp
import os.path
from textwrap import dedent, indent

from typing import List, Dict, Iterator, Set
Expand All @@ -16,9 +16,7 @@
from tcrgo.bam import BAMDict
import tcrgo.io as io
from tcrgo import Log
log = Log(name=__name__)

# TODO: Seems like there ARE standard tags for cell barcode/UMI. CR and OX/MI
log = Log("root")

def main(args):

Expand All @@ -37,7 +35,7 @@ def main(args):
# Maybe could use pysam.PileUpColumn/PileUp for this?

log.info("Writing all (partially) mapped queries not containing V and J alignments to file.")
io.output_nonVJ(queries_nonVJ, args.output_path)
io.output_queries(queries_nonVJ, os.path.join(args.output_path, "queries_nonVJ.tsv"))
log.info(f"Filtering out UMIs with VJ below {args.minimum_reads} reads and randomly "
f"subsampling those with reads exceeding {args.maximum_reads} reads.")
log.info(f"Will be writing filtered queries containing V and J to {args.workers} file(s).")
Expand All @@ -46,7 +44,6 @@ def main(args):
log.success("Done!")

if __name__ == "__main__":

parser = argparse.ArgumentParser(
prog="python -m filter_queries", # Would be good to make a command-line alias and set that here
usage="",
Expand Down Expand Up @@ -75,7 +72,6 @@ def main(args):
"Path to single-end tagged, trimmed, aligned, repaired BAM. IMPORTANT: "
"It is strongly recommended you use the alignment script to create this BAM."
)

# OPTIONAL ARGUMENTS
parser.add_argument(
'-v', "--verbosity",
Expand Down
62 changes: 34 additions & 28 deletions recover_cdr3s.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
"""
This is the second script of the python pipeline.
It is meant to take the filtered reads, find the top V and J alignments,
reconstruct the CDR3 sequences.
reconstruct the CDR3 sequences and select the most representative CDR3.
Run using python -m recover_cdr3s <args>
Requirements:
Python 3.8.5, samtools, pysam
Python >3.8.5, samtools, pysam, biopython, pandas
"""
import argparse
import os.path as osp
import os.path
from textwrap import dedent, indent
import pysam

Expand All @@ -18,10 +18,9 @@
from tcrgo.bam import BAMDict, ReferenceDict
import tcrgo.io as io
from tcrgo import Log
log = Log(name=__name__)
log = Log("root")

def main(args):

log.init(args.verbosity)
log.info("Parsing input data...")
bam = io.sort_and_index(args.bam, args.output_path)
Expand All @@ -37,45 +36,53 @@ def main(args):

log.info("Fetching filtered queries containing alignments to both V and J segments.")
if args.query_list is not None:
id_queries = io.read_query_list(args.query_list)
args.worker = io.get_worker_id(args.query_list)
queries_filename = args.query_list
worker = io.get_worker_id(args.query_list)
else:
id_queries = io.read_id_queries(args.input_path, args.worker)
worker = args.worker
queries_filename = os.path.join(args.input_path, f"queries{worker}.tsv")
id_queries = io.read_queries(queries_filename)

bamdict = BAMDict(bam)
log.info("Finding the top V and J alignments for each query")
# TODO: Consider splitting fully-mapped V/J reads and partially-mapped V/J reads into different camps
# Perform de Bruijn graph sequence assembly on partially mapped reads.
bamdict.build(id_queries, refdict)
log.info("Finished retrieving top V and J alignments for each query.")
num_umis = len(bamdict.get_umis())
log.info(f"Got {len(bamdict.get_barcodes())} barcode(s), "
f"{len(bamdict.get_umis())} UMI(s) and "
f"{len(bamdict.get_reads())} reads total")

log.info(f"{len(bamdict.get_umis())}")
f"{num_umis} UMI(s) and "
f"{len(bamdict.get_reads())} read(s) total")

log.info("For each UMI, resolving V and J alignment identities "
"and selecting the most representative CDR3 sequences...")
count = 0
report_interval = max(1, num_umis // 50)
for barcode in bamdict.get_barcodes():
for umi in barcode.get_umis():
umi.count_regions()
umi.resolve_alignment_identities(refdict)
reads_VJ = umi.get_top_VJ_reads(args.disclude_relatives)
reads_VJ = umi.get_top_VJ_reads(args.exclude_relatives)
if len(reads_VJ) >= args.minimum_cdr3s \
and umi.frequency_top_VJ >= args.minimum_frequency:
for read in reads_VJ:
read.get_cdr3()
umi.select_cdr3(reads_VJ)
count += 1
if count % report_interval == 0:
log.info(f"Processed {count} UMIs ({(count / num_umis):.0%}).")

log.info("Finished recovering CDR3 sequences for all VJ reads"
"and recovered top CDR3 for each UMI.")
bamdict.write_cdr3_info(args.worker, args.output_path)
bamdict.write_cdr3_info(worker, args.output_path)
log.info("Writing reference info for alignment tiebreaks.")
bamdict.write_tiebreaks_alignments(args.worker, args.output_path)
bamdict.write_tiebreaks_alignments(worker, args.output_path)
bamdict.close()
log.success("DONE")

if __name__ == "__main__":

parser = argparse.ArgumentParser(
prog="python -m recover_cdr3s", # Would be good to make a command-line alias and set that here
prog="python -m recover_cdr3s", # TODO: Would be good to make a command-line alias and set that here
usage="",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=dedent(
Expand All @@ -88,7 +95,7 @@ def main(args):
),
epilog=dedent(
"""
Developed by James Gatter & Sarah Nyquist of Shalek Lab (2020).
Developed by James Gatter & Sarah Nyquist of Shalek Lab (2020-2021).
Original Matlab pipeline written by Andy Tu and Tod Gierhann of Love Lab (2019).
"""
)
Expand Down Expand Up @@ -157,17 +164,16 @@ def main(args):
)
# TODO: This never happened. Perhaps repurpose for an additional check on the barcode level?
# Ask Sarah if that's needed.
#parser.add_argument(
# '-s', "--second-pass",
# action="store_true",
# help=
# "After a top VJ combination is found, revisit non-top-VJ-combination reads "
# "to reclaim for CDR3 reconstruction reads which have primary OR high-scoring "
# "secondary alignments to top V and J."
#)
parser.add_argument(
'-s', "--second-pass",
action="store_true",
help=
"After a top VJ combination is found, revisit non-top-VJ-combination reads "
"to reclaim for CDR3 reconstruction reads which have primary OR high-scoring "
"secondary alignments to top V and J."
)

parser.add_argument(
'-r', "--disclude-relatives",
'-r', "--exclude-relatives",
action="store_true",
help=
"When identifying the VJ combination for a UMI, "
Expand Down
14 changes: 8 additions & 6 deletions summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
Run using python -m summary <args>
Requirements:
Python 3.8.5, samtools, pysam, pandas
Python >3.8.5, samtools, pysam, biopython, pandas
"""
import argparse
from pathlib import Path
import tcrgo.io as io
from textwrap import dedent, indent

from tcrgo import Log
log = Log(name=__name__)
log = Log('root')

def main(args):
log.init(args.verbosity)
Expand All @@ -27,11 +27,12 @@ def main(args):
worker_range = range(worker_range[0], worker_range[-1]+1)
else:
worker_range = range(1, worker_range[-1]+1)

aggregated_cdr3_info = io.read_cdr3_info(worker_range, args.input_path, args.string_index)
io.write_dataframe(aggregated_cdr3_info, args.output_path, "aggregated_cdr3_info.tsv")
aggregated_tiebreaks_alignments = io.read_tiebreaks_alignments(worker_range, args.input_path)
io.write_dataframe(aggregated_tiebreaks_alignments, args.output_path, "aggregated_tiebreaks_alignments.tsv")
log.success("DONE")

if __name__ == "__main__":

Expand Down Expand Up @@ -59,10 +60,11 @@ def main(args):
parser.add_argument(
"workers",
type=str,
metavar="<NUMWORKERS or NUMFIRST:NUMLAST>",
metavar="<'ALL' or NUMWORKERS or NUMFIRST:NUMLAST>",
help=
"The number or range [NUMFIRST, NUMLAST] of reconstruct_tcrs text "
"files to be concatenated for summary. NUMFIRST must be > 0."
"The number or range [NUMFIRST, NUMLAST] of recover_cdr3s text "
"files to be concatenated for summary. NUMFIRST must be > 0. ALL "
"will automatically detect all files in the input directory."
)
# OPTIONAL ARGUMENTS
parser.add_argument(
Expand Down
Loading

0 comments on commit f3b2f10

Please sign in to comment.