Skip to content

Commit

Permalink
Merge pull request #101 from shjenkins94/add_gene_name
Browse files Browse the repository at this point in the history
add gene_prefix argument
  • Loading branch information
kirilenkobm authored Sep 30, 2023
2 parents 890827e + 01fe736 commit 6638de5
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 6 deletions.
19 changes: 13 additions & 6 deletions modules/make_query_isoforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
MODULE_NAME_FOR_LOG = "make_query_isoforms"
TOGA_GENE_PREFIX = "TOGA"


def parse_args():
"""Read CMD args."""
app = argparse.ArgumentParser()
Expand All @@ -47,6 +46,12 @@ def parse_args():
help="Disable color filter",
)
app.add_argument("--log_file", help="Log file")
app.add_argument(
"--gene_prefix",
"--gp",
default="TOGA",
help="Prefix to use for query gene identifiers. Default value is TOGA",
)
if len(sys.argv) < 3:
app.print_help()
sys.exit(0)
Expand Down Expand Up @@ -202,7 +207,7 @@ def intersect_exons(chr_dir_exons, exon_id_to_transcript):
return G


def parse_components(components, trans_to_range):
def parse_components(components, trans_to_range, gene_prefix=None):
"""Get genes data.
Each gene has the following data:
Expand All @@ -211,9 +216,10 @@ def parse_components(components, trans_to_range):
3) Genomic range.
"""
to_log(f"{MODULE_NAME_FOR_LOG}: parsing components data to identify query genes")
gp = TOGA_GENE_PREFIX if gene_prefix is None else gene_prefix
genes_data = [] # save gene objects here
for num, component in enumerate(components, 1):
gene_id = f"{TOGA_GENE_PREFIX}_{num}" # need to name them somehow
gene_id = f"{gp}_{num:011}" # need to name them somehow
# get transcripts and their ranges
transcripts = set(component.nodes())
regions = [trans_to_range[t] for t in transcripts]
Expand Down Expand Up @@ -266,7 +272,7 @@ def save_regions(genes_data, output):


def get_query_isoforms_data(
query_bed, query_isoforms, save_genes_track=None, ignore_color=False
query_bed, query_isoforms, save_genes_track=None, ignore_color=False, gene_prefix=None,
):
"""Create isoforms track for query."""
to_log(f"{MODULE_NAME_FOR_LOG}: inferring genes from annotated isoforms in the query")
Expand All @@ -291,7 +297,7 @@ def get_query_isoforms_data(
components = get_graph_components(conn_graph)
to_log(f"{MODULE_NAME_FOR_LOG}: identified {len(components)} connected components in the graph")
# convert components to isoforms table
genes_data = parse_components(components, trans_to_range)
genes_data = parse_components(components, trans_to_range, gene_prefix)
# save the results
save_isoforms(genes_data, query_isoforms)
save_regions(genes_data, save_genes_track)
Expand All @@ -305,4 +311,5 @@ def get_query_isoforms_data(
args.output,
save_genes_track=args.genes_track,
ignore_color=args.ignore_color,
)
gene_prefix=args.gene_prefix,
)
8 changes: 8 additions & 0 deletions toga.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def __init__(self, args):
)

# misc things
self.gene_prefix = args.gene_prefix
self.isoforms_arg = args.isoforms if args.isoforms else None
self.isoforms = None # will be assigned after completeness check
self.chain_jobs = args.chain_jobs_num
Expand Down Expand Up @@ -1262,6 +1263,7 @@ def __orthology_type_map(self):
self.query_annotation,
query_isoforms_file,
save_genes_track=query_gene_spans,
gene_prefix=self.gene_prefix,
)
to_log("Calling orthology types mapping step...")
skipped_ref_trans = os.path.join(self.wd, "ref_orphan_transcripts.txt")
Expand Down Expand Up @@ -1435,6 +1437,12 @@ def parse_args():
"the project name from chain filename, which is not recommended."
)
)
app.add_argument(
"--gene_prefix",
"--gp",
default="TOGA",
help="Prefix to use for query gene identifiers. Default value is TOGA",
)
app.add_argument(
"--min_score",
"--msc",
Expand Down

0 comments on commit 6638de5

Please sign in to comment.