Skip to content

Commit

Permalink
Wired in --exclude-relatives arg
Browse files Browse the repository at this point in the history
  • Loading branch information
jggatter committed Dec 22, 2020
1 parent 530b1ae commit e1df2b0
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 25 deletions.
12 changes: 0 additions & 12 deletions .dockerignore

This file was deleted.

25 changes: 18 additions & 7 deletions recover_cdr3s.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
This is the second script of the python pipeline.
It is meant to take the filtered reads, find the top V and J alignments,
reconstruct the CDR3 sequences.
Run using python -m reconstruct_tcrs <args>
Run using python -m recover_cdr3s <args>
Requirements:
Python 3.8.5, samtools, pysam
Expand Down Expand Up @@ -48,7 +48,6 @@ def main(args):
# Perform de Bruijn graph sequence assembly on partially mapped reads.
bamdict.build(id_queries, refdict)
log.info("Finished retrieving top V and J alignments for each query.")

log.info(f"Got {len(bamdict.get_barcodes())} barcode(s), "
f"{len(bamdict.get_umis())} UMI(s) and "
f"{len(bamdict.get_reads())} reads total")
Expand All @@ -58,26 +57,25 @@ def main(args):
for umi in barcode.get_umis():
umi.count_regions()
umi.resolve_alignment_identities(refdict)
reads_VJ = umi.get_top_VJ_reads()
reads_VJ = umi.get_top_VJ_reads(args.disclude_relatives)
if len(reads_VJ) >= args.minimum_cdr3s \
and umi.frequency_top_VJ >= args.minimum_frequency:
for read in reads_VJ:
read.get_cdr3()
umi.select_cdr3(reads_VJ)
#barcode.resolve_cdr3s() # TODO: Edit distance on barcode level?
# Should probably weight by freq?

log.info("Finished recovering CDR3 sequences for all VJ reads"
"and recovered top CDR3 for each UMI.")
bamdict.write_cdr3_info(args.worker, args.output_path)
log.info("Writing reference info for alignment tiebreaks.")
bamdict.write_tiebreaks_alignments(args.worker, args.output_path)
bamdict.close()
log.success("Done")
log.success("DONE")

if __name__ == "__main__":

parser = argparse.ArgumentParser(
prog="python -m reconstruct_tcrs", # Would be good to make a command-line alias and set that here
prog="python -m recover_cdr3s", # Would be good to make a command-line alias and set that here
usage="",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=dedent(
Expand Down Expand Up @@ -167,6 +165,19 @@ def main(args):
"to reclaim for CDR3 reconstruction reads which have primary OR high-scoring "
"secondary alignments to top V and J."
)

parser.add_argument(
'-r', "--disclude-relatives",
action="store_true",
help=
"When identifying the VJ combination for a UMI, "
"only take for CDR3 recovery reads whose top VJ is exactly equal "
"to the highest counted segment variants. If not enabled, all reads "
"are taken that match the highest counted segment's root (ex: TRAV24-1 "
"is the top V for the UMI, but TRAV24-2 reads are also gathered for "
"CDR3 recovery). Regardless the top segment's specific variant will be "
"reported in the output information."
)
# TODO: Figure out why verbosity isn't working across modules.
parser.add_argument(
'-v', "--verbosity",
Expand Down
14 changes: 8 additions & 6 deletions tcrgo/bam/umi.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def resolve_alignment_identities(self, refdict: ReferenceDict):
for candidate in read[segment][1:]]):
method = "Root"
read[segment] = read[segment][0]
ref_winner = root+'-?'
ref_winner = root + '-?'
else:
method = "Alphabetical"
read[segment] = read[segment][0]
Expand All @@ -125,15 +125,17 @@ def resolve_alignment_identities(self, refdict: ReferenceDict):
#if read[segment+"_reference"] is None:
#log.verbose(read) # DEBUG

# TODO: Make disclude_relatives an argument
def get_top_VJ_reads(self, include_relatives: bool=True) -> List[Read]:
# TODO: Two dicts to count by root and specific variant.
# If not exclude_relatives, get all reads that match root of top variant
# Regardless, report top variant
def get_top_VJ_reads(self, exclude_relatives: bool=False) -> List[Read]:
counts_VJ = Counter()
reads_VJ = defaultdict(list)
for read in self.get_reads():
if include_relatives:
refs_VJ = (read.top_V_reference.root, read.top_J_reference.root)
else:
if exclude_relatives:
refs_VJ = (read.top_V_reference.name, read.top_J_reference.name)
else:
refs_VJ = (read.top_V_reference.root, read.top_J_reference.root)
reads_VJ[refs_VJ].append(read)
counts_VJ[refs_VJ] += 1
self.top_VJ = max(counts_VJ, key=counts_VJ.get)
Expand Down
1 change: 1 addition & 0 deletions tcrgo/dropseq_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,7 @@ def repair_bam(dropseq_jar: str, bam_exontagged: str, bam_repaired: str, min_umi
bam_subrepaired = os.path.join(dirname, bam_subrepaired)
stats_synthesis_error = os.path.join(dirname, "stats_synthesis_error.txt")
summary_synthesis_error = os.path.join(dirname, "summary_synthesis_error.txt")
report_substitution_error = os.path.join(dirname, "report_substitution_error.txt")
report_synthesis_error = os.path.join(dirname, "report_synthesis_error.txt")
command = \
f"""
Expand Down

0 comments on commit e1df2b0

Please sign in to comment.