Skip to content

Commit

Permalink
Merge branch 'master' of github.com:merenlab/anvio
Browse files Browse the repository at this point in the history
  • Loading branch information
meren committed Sep 30, 2024
2 parents cd3c00e + 0fdf682 commit bb161f0
Show file tree
Hide file tree
Showing 24 changed files with 3,046 additions and 988 deletions.
20 changes: 20 additions & 0 deletions anvio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2541,6 +2541,26 @@ def TABULATE(table, header, numalign="right", max_width=0):
"and internal anvi'o heuristics control whether or not indels should be reported, but with this "
"flag all indels are reported."}
),
'list-defline-variables': (
['--list-defline-variables'],
{'default': False,
'action': 'store_true',
'help': "When declared, anvi'o will list the variable names that can be used to construct deflines in "
"FASTA outputs from the user-defined `--defline-format` strings."}
),
'defline-format': (
['--defline-format'],
{'default': '{gene_caller_id}',
'metavar': "F-STRING",
'help': "Provide a defline template for anvi'o to use when generating the FASTA output. The way this "
"works is actually quite simple: first you learn about all the options that exist using the "
"`--list-defline-variables` flag, and then use them to create your template. Available variables "
"should be listed within curly brackets, which will be evaluated in context. Anything outside "
"of curly brackets will be kept as is. For instance, if you would like your defline to have "
"the gene caller ID after the contig name in which it occurs, you can use this template: "
"'{contig_name}_{gene_caller_id}', and your defline will look like '>XXX_182'. See more "
"examples in online help."}
),
'report-extended-deflines': (
['--report-extended-deflines'],
{'default': False,
Expand Down
6 changes: 6 additions & 0 deletions anvio/data/misc/KEGG-SNAPSHOTS.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ v2024-09-08:
hash: 5a9644d40061
modules_db_version: 4

v2024-09-29:
url: https://figshare.com/ndownloader/files/49500822
archive_name: KEGG_build_2024-09-29_5a9644d40061.tar.gz
hash: 5a9644d40061
modules_db_version: 4

# How to add a new KEGG snapshot to this file:
# 1. download the latest data directly from KEGG by running
# `anvi-setup-kegg-data -D --kegg-data-dir ./KEGG -T 5 --include-stray-KOs`
Expand Down
72 changes: 64 additions & 8 deletions anvio/dbops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,10 +1120,42 @@ def get_gene_amino_acid_sequence(self, gene_caller_ids):
return sequences


def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_file_path=None, reverse_complement_if_necessary=True, include_aa_sequences=False, flank_length=0,
output_file_path_external_gene_calls=None, simple_headers=False, report_aa_sequences=False, wrap=120, rna_alphabet=False):
def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_file_path=None, reverse_complement_if_necessary=True,
include_aa_sequences=False, flank_length=0, output_file_path_external_gene_calls=None,
simple_headers=False, list_defline_variables=False, defline_format='{gene_caller_id}',
report_aa_sequences=False, wrap=120, rna_alphabet=False):

##################################################################################################
#
# DEFLINE FORMATTING AND REPORTING RELATED PRE-CHECKS
#
##################################################################################################
# available options to determine deflines through user-provided f-strings. the dictionary is
# populated below, and if you make any changes here, please don't forget to update it there too:
defline_data_dict = {'gene_caller_id': None,
'contig_name': None,
'start': None,
'stop': None,
'direction': None,
'length': None,
'contigs_db_project_name': None}

# if the user needs to see the list, show the list and quit
if list_defline_variables:
self.run.warning(f"Here are the variables you can use to provide a user-defined defline template: ")
for key in defline_data_dict.keys():
self.run.info_single("{%s}" % key)
self.run.info_single("Remember, by default, anvi'o will only use '{gene_caller_id}' to format the deflines of "
"FASTA files it produces.", level=0, nl_before=1, nl_after=1, mc='red')

sys.exit()

##################################################################################################
#
# BUNCH OF SANITY CHECKS BEFORE WE GET INTO BUSINESS
#
##################################################################################################

# bunch of sanity checks below
if not isinstance(gene_caller_ids_list, list):
raise ConfigError("Gene caller's ids must be of type 'list'")

Expand Down Expand Up @@ -1166,6 +1198,20 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil
"also ask FASTA file headers for gene sequences to be not simple. External gene calls file and the FASTA "
"file must match, and anvi'o will have to take care of it without your supervision.")

# if we came all the way down here without a defline format, let's set one up:
if not defline_format:
defline_format = "{gene_caller_id}"

# we will also check if the `defline_format` is composed of variables that are defined in
# the `defline_data_dict` which is filled later
utils.get_f_string_evaluated_by_dict(defline_format, defline_data_dict)

##################################################################################################
#
# BUSINESS TIME
#
##################################################################################################

# finally getting our sequences initialized. please NOTE that we do it only if there are no
# contig sequences available OR if the gene caller ids of interest is not represented among
# those that were previously initialized.
Expand Down Expand Up @@ -1255,6 +1301,17 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil
else:
gene_call['aa_sequence'] = None

# let's populate the dictionary that holds all the information that could be used to report
# gene FASTA files. if you change anything in this dictionary, please don't forget to
# update the list of variables where it is first defined in this function.
defline_data_dict = {'gene_caller_id': gene_callers_id,
'contig_name': gene_call['contig'],
'start': gene_call['start'],
'stop': gene_call['stop'],
'direction': gene_call['direction'],
'length': gene_call['length'],
'contigs_db_project_name': self.a_meta['project_name_str']}

if output_file_path_external_gene_calls:
# if the user is asking for an external gene calls file, the FASTA file for sequences
# should not start with digits and we also need to set the contig name in sequences
Expand All @@ -1266,10 +1323,9 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil
gene_call['start'] = 0
gene_call['stop'] = gene_call['length']
else:
if simple_headers:
gene_call['header'] = '%d' % (gene_callers_id)
else:
gene_call['header'] = '%d ' % (gene_callers_id) + ';'.join(['%s:%s' % (k, str(gene_call[k])) for k in ['contig', 'start', 'stop', 'direction', 'rev_compd', 'length']])
# build the defline from the user-provided (or default) `defline_format` template,
# filling in the per-gene values collected in `defline_data_dict`.
gene_call['header'] = utils.get_f_string_evaluated_by_dict(defline_format, defline_data_dict)
if not simple_headers:
    # extended headers: append the gene call details exactly once. `+=` already keeps the
    # existing header, so adding `gene_call['header']` again on the right-hand side would
    # duplicate the defline (e.g. '182182 contig:...' instead of '182 contig:...').
    gene_call['header'] += ' ' + ';'.join(['%s:%s' % (k, str(gene_call[k])) for k in ['contig', 'start', 'stop', 'direction', 'rev_compd', 'length']])

# adding the updated gene call to our sequences dict.
sequences_dict[gene_callers_id] = gene_call
Expand Down Expand Up @@ -4247,7 +4303,7 @@ def init(self):

# set a project name for the contigs database without any funny
# characters to make sure it can be used programmatically later.
self.meta['project_name_str'] = self.meta['project_name'].translate({ord(c): "_" for c in "\"'!@#$%^&*()[]{};:,./<>?\|`~-=_+ "}).replace('__', '_') \
self.meta['project_name_str'] = self.meta['project_name'].strip().translate({ord(c): "_" for c in "\"'!@#$%^&*()[]{};:,./<>?\|`~-=_+ "}).replace('__', '_').strip('_') \
if self.meta['project_name'] else '___'.join(['UNKNOWN', self.meta['contigs_db_hash']])

if 'creation_date' not in self.meta:
Expand Down
4 changes: 3 additions & 1 deletion anvio/docs/artifacts/kegg-data.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ KEGG
|- MODULES.db
|- ko_list.txt
|- modules.keg
|- br08901.json
|- hierarchies.json
|- HMMs
| |- Kofam.hmm
Expand Down Expand Up @@ -56,7 +57,8 @@ However, for the curious, here is a description of each component in this data d
- The `orphan_data` subfolder: contains KOfam profiles for KOs that do not have a bitscore threshold in the `ko_list.txt` file (in the `.hmm` file) and their corresponding entries from the `ko_list.txt` file (in `01_ko_fams_with_no_threshold.txt`). Please note that KOs from the `orphan_data` directory will *not* be annotated in your %(contigs-db)s when you run %(anvi-run-kegg-kofams)s. However, if you ever need to take a look at these profiles or use them in any way, here they are. :)
- `modules.keg`: a flat text file describing all metabolic modules available in the [KEGG MODULE](https://www.genome.jp/kegg/module.html) resource. This includes pathway and signature modules, but not reaction modules.
- The `modules` subfolder: contains flat text files, one for each metabolic module, downloaded using the [KEGG REST API](https://www.kegg.jp/kegg/rest/keggapi.html). Each file describes a metabolic module's definition, classification, component orthologs, metabolic reactions, compounds, and any miscellaneous data like references and such. For an example, see the [module file for M00001](https://rest.kegg.jp/get/M00001/).
- `hierarchies.json`: a JSON-formatted file describing the available functional hierarchies in the [KEGG BRITE](https://www.genome.jp/kegg/brite.html) resource.
- `br08901.json`: a JSON-formatted KEGG BRITE [file](https://rest.kegg.jp/get/br:br08901/json) classifying [KEGG pathway maps](https://www.genome.jp/kegg/pathway.html).
- `hierarchies.json`: a JSON-formatted KEGG BRITE [file](https://rest.kegg.jp/get/br:br08902/json) describing the available functional hierarchies in the [KEGG BRITE](https://www.genome.jp/kegg/brite.html) resource.
- The `BRITE` subfolder: contains JSON-formatted files, each one of which describes a BRITE hierarchy.
- `MODULES.db`: a SQLite database containing data parsed from the module files and BRITE hierarchies. See %(modules-db)s.

Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit bb161f0

Please sign in to comment.