
Commit 02b617c: Updated to V1.1
kirilenkobm committed Oct 24, 2022
1 parent d6a4116 commit 02b617c
Showing 21 changed files with 2,287 additions and 487 deletions.
232 changes: 203 additions & 29 deletions CESAR_wrapper.py

Large diffs are not rendered by default.

31 changes: 28 additions & 3 deletions cesar_runner.py
@@ -12,6 +12,9 @@
__credits__ = ["Michael Hiller", "Virag Sharma", "David Jebb"]

MAX_ATTEMPTS = 2
ZERO_CODE = 0
ERR_CODE = 1
FRAGM_CHAIN_ISSUE_CODE = 2


def parse_args():
@@ -23,6 +26,7 @@ def parse_args():
"--check_loss", default=None, help="File to save gene loss data if requested"
)
app.add_argument("--rejected_log", default=None, help="Log gene rejection events")
app.add_argument("--unproc_log", "--ul", default=None, help="Log unprocessed genes")
# print help if there are no args
if len(sys.argv) < 2:
app.print_help()
@@ -43,13 +47,22 @@ def call_job(cmd):
rc = p.returncode
cmd_out = b_stdout.decode("utf-8")
err_msg = b_stderr.decode("utf-8").replace("\n", " ")
if rc == 0:
return cmd_out, 0
if rc == ZERO_CODE:
return cmd_out, ZERO_CODE
elif rc == FRAGM_CHAIN_ISSUE_CODE:
err_msg = f"CESAR_wrapper.py detected that fragments overlap for {cmd}, abort"
return err_msg, FRAGM_CHAIN_ISSUE_CODE
else:
eprint(err_msg)
eprint(f"\n{cmd} FAILED")
attempts += 1
return err_msg, 1 # send failure signal
return err_msg, ERR_CODE # send failure signal


def __job_to_transcript(job):
"""Extract transcript ID from job."""
fields = job.split()
return fields[1]


def main():
@@ -60,6 +73,7 @@ def main():
# text file, a command per line
jobs = [x.rstrip() for x in f.readlines()]
jobs_num = len(jobs)
unprocessed_genes = []

out = open(args.output, "w") # handle output file
gene_loss_data = [] # list to keep gene loss detector out
@@ -69,6 +83,12 @@ def main():
eprint(f"Calling:\n{job}")
# catch job stdout
job_out, rc = call_job(job)
if rc == FRAGM_CHAIN_ISSUE_CODE:
# very special case -> nothing we can do
# mark as missing, I guess
rejected.append(f"{job}\tfragment chains overlap\n")
unprocessed_genes.append(__job_to_transcript(job))
continue
if rc == 1:
# a job failed with code 1 -> send the signal upstream
# abort execution, write what job exactly failed
@@ -136,6 +156,11 @@ def main():
f.write("".join(rejected))
f.close()

if args.unproc_log and len(unprocessed_genes) > 0:
f = open(args.unproc_log, "w")
for elem in unprocessed_genes:
f.write(f"{elem}\n")
f.close()

if __name__ == "__main__":
main()
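For orientation, a small sketch (not part of the commit) of how the pieces added here fit together: the wrapper signals overlapping fragment chains with exit code 2, the runner pulls the transcript ID out of the job line, and that transcript is written to the --unproc_log file. The job-line layout and the sys.exit call on the wrapper side are assumptions, not shown in this diff.

import sys

FRAGM_CHAIN_ISSUE_CODE = 2  # same value as the constant added in cesar_runner.py

# Wrapper side (assumed): abort with the dedicated exit code so that
# call_job() in cesar_runner.py can tell this apart from a generic failure.
def abort_if_fragments_overlap(fragments_overlap):
    if fragments_overlap:
        sys.stderr.write("fragment chains overlap, aborting\n")
        sys.exit(FRAGM_CHAIN_ISSUE_CODE)

# Runner side: a hypothetical job line whose second field is the transcript ID,
# which is what __job_to_transcript() returns via job.split()[1].
job = "./CESAR_wrapper.py ENST00000000233 169 ..."  # layout is an assumption
unprocessed_genes = [job.split()[1]]                # -> ["ENST00000000233"]

# Each unprocessed transcript goes to --unproc_log, one per line.
with open("unprocessed.txt", "w") as f:
    for transcript in unprocessed_genes:
        f.write(f"{transcript}\n")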
2 changes: 2 additions & 0 deletions chain_runner.py
@@ -560,12 +560,14 @@ def main():
# 2) an argument: "chain ,-sep list of genes"
batch = read_input(args.input_file)
task_size = len(batch)
# TODO: check whether the .bst index is still needed
# load chains dict; it would be much faster to load chain_ID: (start_byte, offset)
# python dict once than ask HDF5 database each time TOGA needs another chain
index_file = args.chain_file.replace(".chain", ".chain_ID_position")
chain_dict = load_chain_dict(index_file)

# call main processing tool
# TODO: rename genes to transcripts where appropriate
for job_num, (chain, genes) in enumerate(batch.items(), 1):
# one unit: one chain + intersected genes
# call routine that extracts chain feature
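Sketching what the comment above describes, a possible shape for an index-loading helper like load_chain_dict(); the tab-separated layout of the .chain_ID_position index (chain ID, start byte, offset) is an assumption made for illustration only.

def load_chain_dict(index_file):
    """Load a chain_ID -> (start_byte, offset) dict once, instead of asking
    the HDF5 database for every chain (index file layout is assumed)."""
    chain_dict = {}
    with open(index_file, "r") as f:
        for line in f:
            fields = line.rstrip().split("\t")
            if len(fields) < 3:
                continue  # skip malformed or empty lines
            chain_id, start_byte, offset = fields[0], int(fields[1]), int(fields[2])
            chain_dict[chain_id] = (start_byte, offset)
    return chain_dict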
5 changes: 5 additions & 0 deletions modules/GLP_values.py
@@ -6,7 +6,12 @@
DEL_MISS = {MISS_EXON, DEL_EXON}
COMPENSATION = "COMPENSATION"
SSM = "SSM"
# (ag)acceptor-EXON-donor(gt)
SSM_D = "SSMD" # Donor, right, GT,GC
SSM_A = "SSMA" # Acceptor, left, AG

START_MISSING = "START_MISSING"
ATG = "ATG"
FS_DEL = "FS_DEL"
FS_INS = "FS_INS"
BIG_DEL = "BIG_DEL"
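To illustrate the donor/acceptor convention spelled out in the new comments (acceptor AG on the exon's left, donor GT or GC on its right), a hypothetical helper; none of this code is in the commit itself.

SSM_D = "SSMD"  # donor site, exon's right boundary, canonical GT (or GC)
SSM_A = "SSMA"  # acceptor site, exon's left boundary, canonical AG

CANONICAL = {SSM_D: {"GT", "GC"}, SSM_A: {"AG"}}

def splice_site_mutated(kind, dinucleotide):
    """Return True if the observed splice-site dinucleotide is non-canonical."""
    return dinucleotide.upper() not in CANONICAL[kind]

# Example: an exon flanked by AG ... GT is intact on both sides.
assert splice_site_mutated(SSM_A, "ag") is False
assert splice_site_mutated(SSM_D, "GT") is False
assert splice_site_mutated(SSM_D, "AT") is True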
1 change: 1 addition & 0 deletions modules/classify_chains.py
@@ -91,6 +91,7 @@ def classify_chains(
# -> then this is a proc pseudogene

# move trans chains to a different dataframe
# TODO: rename trans -> spanning
# trans chain -> a syntenic chain that passes through the gene body
# but has no aligning bases in the CDS
trans_lines = df[(df["exon_cover"] == 0) & (df["synt"] > 1)]
7 changes: 4 additions & 3 deletions modules/gene_losses_summary.py
@@ -144,13 +144,13 @@ def read_loss_data(loss_dir):
perc = float(line_data[2].split()[1])
projection_to_p_intact_M_intact[projection_id] = perc
continue
elif line_data[2].startswith("MIDDLE_80%_INTACT"):
# flag: are there inact mutations in the middle 80% of CDS?
elif line_data[2].startswith("MIDDLE_IS_INTACT"):
# flag: are there inactivating mutations in the first 90%/mid 80% of CDS?
raw_val = line_data[2].split()[1]
val = True if raw_val == "TRUE" else False
proj_to_80_p_intact[projection_id] = val
continue
elif line_data[2].startswith("MIDDLE_80%_PRESENT"):
elif line_data[2].startswith("MIDDLE_IS_PRESENT"):
# flag: any missing fragment in the middle 80% of CDS?
raw_val = line_data[2].split()[1]
val = True if raw_val == "TRUE" else False
@@ -246,6 +246,7 @@ def get_projection_classes(
p_intact_M_ign = p_to_pint_m_ign.get(projection, -1)
p_intact_M_int = p_to_pint_m_int.get(projection, -1)
p_i_codons = p_to_i_codon_prop.get(projection, -1)
# TODO: rename to NO LOSS IN FIRST 90%
no_loss_in_80_p = p_80_int.get(projection, None)
m_80_present = p_80_pre.get(projection, None)
frame_oub = p_to_p_out_of_bord.get(projection, 0.0)
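A short sketch of how a loss-report line carrying the renamed MIDDLE_IS_INTACT / MIDDLE_IS_PRESENT keys is interpreted by the parsing logic in read_loss_data(); the sample line and the position of the projection ID are assumptions based only on what this hunk shows.

# Hypothetical report line; only field 2 ("KEY VALUE") is visible in the hunk.
line = "# P\tENST00000000233.1000\tMIDDLE_IS_INTACT TRUE"
line_data = line.rstrip().split("\t")
projection_id = line_data[1]  # assumption: projection ID is the second field

proj_to_80_p_intact = {}
if line_data[2].startswith("MIDDLE_IS_INTACT"):
    raw_val = line_data[2].split()[1]          # "TRUE" or "FALSE"
    proj_to_80_p_intact[projection_id] = raw_val == "TRUE"

print(proj_to_80_p_intact)  # {'ENST00000000233.1000': True}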