Skip to content

Commit

Permalink
Options for dealing with tax information
Browse files Browse the repository at this point in the history
Setting "--include-missing-functions" represents organisms that are not in KEGG Genomes
Setting "--map-all" ignores KEGG Genomes completely, and represents all functions identified, regardless of information that the function is not available for the organism
closes #11
  • Loading branch information
iquasere committed May 26, 2023
1 parent 9e914e9 commit f01076e
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 22 deletions.
60 changes: 41 additions & 19 deletions keggcharter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from keggpathway_map import KEGGPathwayMap, expand_by_list_column

__version__ = "0.5.1"
__version__ = "0.6.0"


def get_arguments():
Expand All @@ -42,21 +42,29 @@ def get_arguments():
parser.add_argument("-keggc", "--kegg-column", help="Column with KEGG IDs.")
parser.add_argument("-koc", "--ko-column", help="Column with KOs.")
parser.add_argument("-ecc", "--ec-column", help="Column with EC numbers.")
# TODO - test this argument without UniProt shenanigans
parser.add_argument(
"-tc", "--taxa-column", default='Taxonomic lineage (GENUS)',
help="Column with the taxa designations to represent with KEGGCharter."
" NOTE - for valid taxonomies, check: https://www.genome.jp/kegg/catalog/org_list.html")
parser.add_argument(
"-iq", "--input-quantification", action="store_true",
help="If input table has no quantification, will create a mock quantification column")
parser.add_argument(
"-it", "--input-taxonomy", default=None,
help="If no taxonomy column exists and there is only one taxon in question.")
# TODO - test this argument without UniProt shenanigans
parser.add_argument(
"-tc", "--taxa-column", default='Taxonomic lineage (GENUS)',
help="Column with the taxa designations to represent with KEGGCharter")
"--step", default=40, type=int, help="Number of IDs to submit per request through the KEGG API [40]")
parser.add_argument(
"--map-all", default=False, action="store_true",
help="Ignore KEGG taxonomic information. All functions for all KOs will be represented,"
" even if they aren't attributed by KEGG to the specific species.")
parser.add_argument(
"--include-missing-genomes", default=False, action="store_true",
help="Map the functions for KOs identified for organisms not present in KEGG Genomes.")
parser.add_argument(
"--resume", action="store_true", default=False,
help="If data inputed has already been analyzed by KEGGCharter.")
parser.add_argument(
"--step", default=40, type=int, help="Number of IDs to submit per request through the KEGG API [40]")
parser.add_argument('-v', '--version', action='version', version='KEGGCharter ' + __version__)

required_named = parser.add_argument_group('required named arguments')
Expand Down Expand Up @@ -247,6 +255,10 @@ def kegg_metabolic_maps():


def write_kgml(mmap, output, organism='ko'):
"""
This function is confusing, in that it both writes the KGML, and parses it.
Still, it works, and for now that's enough.
"""
data = kegg_get(f"{organism}{mmap}", "kgml").read()
with open(output, 'w') as f:
if len(data) > 1:
Expand Down Expand Up @@ -320,30 +332,39 @@ def parse_organism(file):
return pd.read_csv(file, sep='\t', usecols=[1, 2], header=None, index_col=1, names=['prefix', 'name'])


def download_resources(data, resources_directory, taxa_column, metabolic_maps):
def download_resources(
data, resources_directory, taxa_column, metabolic_maps, map_all=False, map_non_kegg_genomes=True):
"""
Download all resources for a given dataframe
:param data: pandas.DataFrame - dataframe with taxa names in taxa_column
:param resources_directory: str - directory where to save the resources
:param taxa_column: str - column name in dataframe with taxa names
:param metabolic_maps: list - metabolic maps to download
:param map_all: bool - if True, attribute all maps and all functions to all taxa, only limit by the identifications
:param map_non_kegg_genomes: bool - if True, map non-KEGG genomes to KEGG orthologs
:return: taxon_to_mmap_to_orthologs - dic with taxon name as key and dic with metabolic maps as values
"""
download_organism(resources_directory)
taxa = ['ko'] + list(set(data[data[taxa_column].notnull()][taxa_column]))
taxa_df = parse_organism(f'{resources_directory}/organism')
i = 1
taxon_to_mmap_to_orthologs = {} # {'Keratinibaculum paraultunense' : {'00190': ['1', '2']}}
for taxon in tqdm(taxa, desc=f'Getting information for {len(taxa) - 1} taxa'):
kegg_prefix = taxon2prefix(taxon, taxa_df)
if kegg_prefix is not None:
taxon_mmaps = get_taxon_maps(kegg_prefix)
taxon_mmaps = [mmap for mmap in taxon_mmaps if mmap in metabolic_maps] # select only inputted maps
taxon_to_mmap_to_orthologs[taxon] = write_kgmls(
taxon_mmaps, f'{resources_directory}/kc_kgmls', org=kegg_prefix)
else:
taxon_to_mmap_to_orthologs[taxon] = {}
i += 1
if map_all: # attribute all maps and all functions to all taxa, only limit by the data
taxon_to_mmap_to_orthologs = {taxon: write_kgmls(
metabolic_maps, f'{resources_directory}/kc_kgmls', org='ko') for taxon in taxa}
else:
for taxon in tqdm(taxa, desc=f'Getting information for {len(taxa) - 1} taxa'):
kegg_prefix = taxon2prefix(taxon, taxa_df)
if kegg_prefix is not None:
taxon_mmaps = get_taxon_maps(kegg_prefix)
taxon_mmaps = [mmap for mmap in taxon_mmaps if mmap in metabolic_maps] # select only inputted maps
taxon_to_mmap_to_orthologs[taxon] = write_kgmls(
taxon_mmaps, f'{resources_directory}/kc_kgmls', org=kegg_prefix)
else:
if map_non_kegg_genomes:
taxon_to_mmap_to_orthologs[taxon] = write_kgmls(
metabolic_maps, f'{resources_directory}/kc_kgmls', org='ko')
else:
taxon_to_mmap_to_orthologs[taxon] = {}
with open(f'{resources_directory}/taxon_to_mmap_to_orthologs.json', 'w') as f:
json.dump(taxon_to_mmap_to_orthologs, f)
return taxon_to_mmap_to_orthologs
Expand Down Expand Up @@ -459,7 +480,8 @@ def main():
data.to_csv(f'{args.output}/data_for_charting.tsv', sep='\t', index=False)
if not args.input_taxonomy:
taxon_to_mmap_to_orthologs = download_resources(
data, args.resources_directory, args.taxa_column, args.metabolic_maps)
data, args.resources_directory, args.taxa_column, args.metabolic_maps, map_all=args.map_all,
map_non_kegg_genomes=args.include_missing_genomes)
h = open(f"{args.output}/taxon_to_mmap_to_orthologs.json", "w")
json.dump(taxon_to_mmap_to_orthologs, h)
else:
Expand Down
5 changes: 2 additions & 3 deletions keggpathway_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def pathway_box_list(self, taxa_in_box, dic_colors, maxshared=10):
if nrboxes > maxshared:
nrboxes = maxshared

paired = True if nrboxes % 2 == 0 else False
paired = nrboxes % 2 == 0
for i in range(nrboxes):
newrecord = create_box_heatmap(
self.orthologs[boxidx], nrboxes, i * 2 - (nrboxes - 1) if paired else i - int(nrboxes / 2),
Expand All @@ -291,7 +291,6 @@ def pathway_boxes_differential(self, dataframe, log=True, colormap="coolwarm"):
:param log: bol providing the option for a log transformation of data
:param colormap: str representing a costum matplotlib colormap to be used
"""

if log:
norm = cm.colors.LogNorm(vmin=dataframe.min().min(), vmax=dataframe.max().max())
else:
Expand All @@ -308,7 +307,7 @@ def pathway_boxes_differential(self, dataframe, log=True, colormap="coolwarm"):
for box in dataframe.index.tolist():
boxidx = self.ortho_ids_to_pos[box] # get box index
colors = dataframe.loc[box].tolist()
paired = True if nrboxes % 2 == 0 else False
paired = nrboxes % 2 == 0
for i in range(nrboxes):
newrecord = create_box_heatmap(
self.orthologs[boxidx], nrboxes, i * 2 - (nrboxes - 1) if paired else i - int(nrboxes / 2),
Expand Down

0 comments on commit f01076e

Please sign in to comment.