
Commit 02b617c: Updated to V1.1
kirilenkobm committed Oct 24, 2022
1 parent d6a4116 commit 02b617c
Showing 21 changed files with 2,287 additions and 487 deletions.
232 changes: 203 additions & 29 deletions CESAR_wrapper.py

Large diffs are not rendered by default.

31 changes: 28 additions & 3 deletions cesar_runner.py
@@ -12,6 +12,9 @@
__credits__ = ["Michael Hiller", "Virag Sharma", "David Jebb"]

MAX_ATTEMPTS = 2
ZERO_CODE = 0
ERR_CODE = 1
FRAGM_CHAIN_ISSUE_CODE = 2


def parse_args():
@@ -23,6 +26,7 @@ def parse_args():
"--check_loss", default=None, help="File to save gene loss data if requested"
)
app.add_argument("--rejected_log", default=None, help="Log gene rejection events")
app.add_argument("--unproc_log", "--ul", default=None, help="Log unprocessed genes")
# print help if there are no args
if len(sys.argv) < 2:
app.print_help()
@@ -43,13 +47,22 @@ def call_job(cmd):
rc = p.returncode
cmd_out = b_stdout.decode("utf-8")
err_msg = b_stderr.decode("utf-8").replace("\n", " ")
if rc == 0:
return cmd_out, 0
if rc == ZERO_CODE:
return cmd_out, ZERO_CODE
elif rc == FRAGM_CHAIN_ISSUE_CODE:
err_msg = f"CESAR_wrapper.py detected that fragments overlap for {cmd}, abort"
return err_msg, FRAGM_CHAIN_ISSUE_CODE
else:
eprint(err_msg)
eprint(f"\n{cmd} FAILED")
attempts += 1
return err_msg, 1 # send failure signal
return err_msg, ERR_CODE # send failure signal


def __job_to_transcript(job):
"""Extract transcript ID from job."""
fields = job.split()
return fields[1]


def main():
@@ -60,6 +73,7 @@ def main():
# text file, a command per line
jobs = [x.rstrip() for x in f.readlines()]
jobs_num = len(jobs)
unprocessed_genes = []

out = open(args.output, "w") # handle output file
gene_loss_data = [] # list to keep gene loss detector out
@@ -69,6 +83,12 @@ def main():
eprint(f"Calling:\n{job}")
# catch job stdout
job_out, rc = call_job(job)
if rc == FRAGM_CHAIN_ISSUE_CODE:
# very special case -> nothing we can do
# mark as missing, I guess
rejected.append(f"{job}\tfragment chains overlap\n")
unprocessed_genes.append(__job_to_transcript(job))
continue
if rc == 1:
# a job failed with code 1 -> send the signal upstream
# abort execution, write what job exactly failed
@@ -136,6 +156,11 @@ def main():
f.write("".join(rejected))
f.close()

if args.unproc_log and len(unprocessed_genes) > 0:
f = open(args.unproc_log, "w")
for elem in unprocessed_genes:
f.write(f"{elem}\n")
f.close()

if __name__ == "__main__":
main()
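For orientation, a small sketch (not part of the commit) of how the pieces added here fit together: the wrapper signals overlapping fragment chains with exit code 2, the runner pulls the transcript ID out of the job line, and that transcript is written to the --unproc_log file. The job-line layout and the sys.exit call on the wrapper side are assumptions, not shown in this diff.

import sys

FRAGM_CHAIN_ISSUE_CODE = 2  # same value as the constant added in cesar_runner.py

# Wrapper side (assumed): abort with the dedicated exit code so that
# call_job() in cesar_runner.py can tell this apart from a generic failure.
def abort_if_fragments_overlap(fragments_overlap):
    if fragments_overlap:
        sys.stderr.write("fragment chains overlap, aborting\n")
        sys.exit(FRAGM_CHAIN_ISSUE_CODE)

# Runner side: a hypothetical job line whose second field is the transcript ID,
# which is what __job_to_transcript() returns via job.split()[1].
job = "./CESAR_wrapper.py ENST00000000233 169 ..."  # layout is an assumption
unprocessed_genes = [job.split()[1]]                # -> ["ENST00000000233"]

# Each unprocessed transcript goes to --unproc_log, one per line.
with open("unprocessed.txt", "w") as f:
    for transcript in unprocessed_genes:
        f.write(f"{transcript}\n")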
2 changes: 2 additions & 0 deletions chain_runner.py
@@ -560,12 +560,14 @@ def main():
# 2) an argument: "chain ,-sep list of genes"
batch = read_input(args.input_file)
task_size = len(batch)
# TODO: check whether the .bst index is still needed
# load chains dict; it would be much faster to load chain_ID: (start_byte, offset)
# python dict once than ask HDF5 database each time TOGA needs another chain
index_file = args.chain_file.replace(".chain", ".chain_ID_position")
chain_dict = load_chain_dict(index_file)

# call main processing tool
# TODO: rename genes to transcripts where appropriate
for job_num, (chain, genes) in enumerate(batch.items(), 1):
# one unit: one chain + intersected genes
# call routine that extracts chain feature
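Sketching what the comment above describes, a possible shape for an index-loading helper like load_chain_dict(); the tab-separated layout of the .chain_ID_position index (chain ID, start byte, offset) is an assumption made for illustration only.

def load_chain_dict(index_file):
    """Load a chain_ID -> (start_byte, offset) dict once, instead of asking
    the HDF5 database for every chain (index file layout is assumed)."""
    chain_dict = {}
    with open(index_file, "r") as f:
        for line in f:
            fields = line.rstrip().split("\t")
            if len(fields) < 3:
                continue  # skip malformed or empty lines
            chain_id, start_byte, offset = fields[0], int(fields[1]), int(fields[2])
            chain_dict[chain_id] = (start_byte, offset)
    return chain_dict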
5 changes: 5 additions & 0 deletions modules/GLP_values.py
@@ -6,7 +6,12 @@
DEL_MISS = {MISS_EXON, DEL_EXON}
COMPENSATION = "COMPENSATION"
SSM = "SSM"
# (ag)acceptor-EXON-donor(gt)
SSM_D = "SSMD" # Donor, right, GT,GC
SSM_A = "SSMA" # Acceptor, left, AG

START_MISSING = "START_MISSING"
ATG = "ATG"
FS_DEL = "FS_DEL"
FS_INS = "FS_INS"
BIG_DEL = "BIG_DEL"
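To illustrate the donor/acceptor convention spelled out in the new comments (acceptor AG on the exon's left, donor GT or GC on its right), a hypothetical helper; none of this code is in the commit itself.

SSM_D = "SSMD"  # donor site, exon's right boundary, canonical GT (or GC)
SSM_A = "SSMA"  # acceptor site, exon's left boundary, canonical AG

CANONICAL = {SSM_D: {"GT", "GC"}, SSM_A: {"AG"}}

def splice_site_mutated(kind, dinucleotide):
    """Return True if the observed splice-site dinucleotide is non-canonical."""
    return dinucleotide.upper() not in CANONICAL[kind]

# Example: an exon flanked by AG ... GT is intact on both sides.
assert splice_site_mutated(SSM_A, "ag") is False
assert splice_site_mutated(SSM_D, "GT") is False
assert splice_site_mutated(SSM_D, "AT") is True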
1 change: 1 addition & 0 deletions modules/classify_chains.py
@@ -91,6 +91,7 @@ def classify_chains(
# -> then this is a proc pseudogene

# move trans chains to a different dataframe
# TODO: rename trans -> spanning
# trans chain -> a syntenic chain that passes through the gene body
# but has no aligning bases in the CDS
trans_lines = df[(df["exon_cover"] == 0) & (df["synt"] > 1)]
7 changes: 4 additions & 3 deletions modules/gene_losses_summary.py
@@ -144,13 +144,13 @@ def read_loss_data(loss_dir):
perc = float(line_data[2].split()[1])
projection_to_p_intact_M_intact[projection_id] = perc
continue
elif line_data[2].startswith("MIDDLE_80%_INTACT"):
# flag: are there inact mutations in the middle 80% of CDS?
elif line_data[2].startswith("MIDDLE_IS_INTACT"):
# flag: are there inactivating mutations in the first 90%/mid 80% of CDS?
raw_val = line_data[2].split()[1]
val = True if raw_val == "TRUE" else False
proj_to_80_p_intact[projection_id] = val
continue
elif line_data[2].startswith("MIDDLE_80%_PRESENT"):
elif line_data[2].startswith("MIDDLE_IS_PRESENT"):
# flag: any missing fragment in the middle 80% of CDS?
raw_val = line_data[2].split()[1]
val = True if raw_val == "TRUE" else False
@@ -246,6 +246,7 @@ def get_projection_classes(
p_intact_M_ign = p_to_pint_m_ign.get(projection, -1)
p_intact_M_int = p_to_pint_m_int.get(projection, -1)
p_i_codons = p_to_i_codon_prop.get(projection, -1)
# TODO: rename to NO LOSS IN FIRST 90%
no_loss_in_80_p = p_80_int.get(projection, None)
m_80_present = p_80_pre.get(projection, None)
frame_oub = p_to_p_out_of_bord.get(projection, 0.0)
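A short sketch of how a loss-report line carrying the renamed MIDDLE_IS_INTACT / MIDDLE_IS_PRESENT keys is interpreted by the parsing logic in read_loss_data(); the sample line and the position of the projection ID are assumptions based only on what this hunk shows.

# Hypothetical report line; only field 2 ("KEY VALUE") is visible in the hunk.
line = "# P\tENST00000000233.1000\tMIDDLE_IS_INTACT TRUE"
line_data = line.rstrip().split("\t")
projection_id = line_data[1]  # assumption: projection ID is the second field

proj_to_80_p_intact = {}
if line_data[2].startswith("MIDDLE_IS_INTACT"):
    raw_val = line_data[2].split()[1]          # "TRUE" or "FALSE"
    proj_to_80_p_intact[projection_id] = raw_val == "TRUE"

print(proj_to_80_p_intact)  # {'ENST00000000233.1000': True}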