Wired in --exclude-relatives arg

ShalekLab · Dec 22, 2020 · e1df2b0 · e1df2b0
1 parent 530b1ae
commit e1df2b0
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 25 deletions.
diff --git a/.dockerignore b/.dockerignore
diff --git a/recover_cdr3s.py b/recover_cdr3s.py
@@ -2,7 +2,7 @@
 This is the second script of the python pipeline. 
 It is meant to take the filtered reads, find the top V and J alignments,
 reconstruct the CDR3 sequences.
-Run using python -m reconstruct_tcrs <args>
+Run using python -m recover_cdr3s <args>
 
 Requirements:
 	Python 3.8.5, samtools, pysam 
@@ -48,7 +48,6 @@ def main(args):
 	# Perform de Bruijn graph sequence assembly on partially mapped reads.
 	bamdict.build(id_queries, refdict)
 	log.info("Finished retrieving top V and J alignments for each query.")
-
 	log.info(f"Got {len(bamdict.get_barcodes())} barcode(s), "
 			f"{len(bamdict.get_umis())} UMI(s) and "
 			f"{len(bamdict.get_reads())} reads total")
@@ -58,26 +57,25 @@ def main(args):
 		for umi in barcode.get_umis():
 			umi.count_regions()
 			umi.resolve_alignment_identities(refdict)
-			reads_VJ = umi.get_top_VJ_reads()
+			reads_VJ = umi.get_top_VJ_reads(args.disclude_relatives)
 			if len(reads_VJ) >= args.minimum_cdr3s \
 				and umi.frequency_top_VJ >= args.minimum_frequency:
 				for read in reads_VJ:
 					read.get_cdr3()
 				umi.select_cdr3(reads_VJ)
-			#barcode.resolve_cdr3s() # TODO: Edit distance on barcode level?
-			# Should probably weight by freq?
+
 	log.info("Finished recovering CDR3 sequences for all VJ reads"
 		 "and recovered top CDR3 for each UMI.")
 	bamdict.write_cdr3_info(args.worker, args.output_path)
 	log.info("Writing reference info for alignment tiebreaks.")
 	bamdict.write_tiebreaks_alignments(args.worker, args.output_path)
 	bamdict.close()
-	log.success("Done")
+	log.success("DONE")
 
 if __name__ == "__main__":
 
 	parser = argparse.ArgumentParser(
-		prog="python -m reconstruct_tcrs", # Would be good to make a command-line alias and set that here
+		prog="python -m recover_cdr3s", # Would be good to make a command-line alias and set that here
 		usage="",
 		formatter_class=argparse.RawDescriptionHelpFormatter,
 		description=dedent(
@@ -167,6 +165,19 @@ def main(args):
 			"to reclaim for CDR3 reconstruction reads which have primary OR high-scoring " 
 			"secondary alignments to top V and J."
 	)
+
+	parser.add_argument(
+		'-r', "--disclude-relatives",
+		action="store_true",
+		help=
+			"When identifying the VJ combination for a UMI, "
+			"only take for CDR3 recovery reads whose top VJ is exactly equal "
+			"to the highest counted segment variants. If not enabled, all reads "
+			"are taken that match the highest counted segment's root (ex: TRAV24-1 "
+			"is the top V for the UMI, but TRAV24-2 reads are also gathered for "
+			"CDR3 recovery). Regardless the top segment's specific variant will be "
+			"reported in the output information."
+	)
 	# TODO: Figure out why verbosity isn't working across modules.
 	parser.add_argument(
 		'-v', "--verbosity", 

diff --git a/tcrgo/bam/umi.py b/tcrgo/bam/umi.py
@@ -108,7 +108,7 @@ def resolve_alignment_identities(self, refdict: ReferenceDict):
 						for candidate in read[segment][1:]]):
 						method = "Root"
 						read[segment] = read[segment][0]
-						ref_winner = root+'-?'
+						ref_winner = root + '-?'
 					else:
 						method = "Alphabetical"
 						read[segment] = read[segment][0]
@@ -125,15 +125,17 @@ def resolve_alignment_identities(self, refdict: ReferenceDict):
 				#if read[segment+"_reference"] is None: 
 					#log.verbose(read) # DEBUG
 
-	# TODO: Make disclude_relatives an argument
-	def get_top_VJ_reads(self, include_relatives: bool=True) -> List[Read]:
+	# TODO: Two dicts to count by root and specific variant.
+	# If not exclude_relatives, get all reads that match root of top variant
+	# Regardless, report top variant
+	def get_top_VJ_reads(self, exclude_relatives: bool=False) -> List[Read]:
 		counts_VJ = Counter()
 		reads_VJ = defaultdict(list)
 		for read in self.get_reads():
-			if include_relatives:
-				refs_VJ = (read.top_V_reference.root, read.top_J_reference.root)
-			else:
+			if exclude_relatives:
 				refs_VJ = (read.top_V_reference.name, read.top_J_reference.name)
+			else:
+				refs_VJ = (read.top_V_reference.root, read.top_J_reference.root)
 			reads_VJ[refs_VJ].append(read)
 			counts_VJ[refs_VJ] += 1
 		self.top_VJ = max(counts_VJ, key=counts_VJ.get)

diff --git a/tcrgo/dropseq_tools.py b/tcrgo/dropseq_tools.py
@@ -471,6 +471,7 @@ def repair_bam(dropseq_jar: str, bam_exontagged: str, bam_repaired: str, min_umi
 	bam_subrepaired = os.path.join(dirname, bam_subrepaired)
 	stats_synthesis_error = os.path.join(dirname, "stats_synthesis_error.txt")
 	summary_synthesis_error = os.path.join(dirname, "summary_synthesis_error.txt")
+	report_substitution_error = os.path.join(dirname, "report_substitution_error.txt")
 	report_synthesis_error = os.path.join(dirname, "report_synthesis_error.txt")
 	command = \
 		f"""