Merge branch 'master' of github.com:merenlab/anvio

merenlab · Sep 30, 2024 · bb161f0 · bb161f0
2 parents cd3c00e + 0fdf682
commit bb161f0
Show file tree

Hide file tree

Showing 24 changed files with 3,046 additions and 988 deletions.
diff --git a/anvio/__init__.py b/anvio/__init__.py
@@ -2541,6 +2541,26 @@ def TABULATE(table, header, numalign="right", max_width=0):
                      "and internal anvi'o heuristics control whether or not indels should be reported, but with this "
                      "flag all indels are reported."}
                 ),
+    'list-defline-variables': (
+            ['--list-defline-variables'],
+            {'default': False,
+             'action': 'store_true',
+             'help': "When declared, anvi'o will list the variable names that can be used to construct deflines in "
+                     "FASTA outputs from the user-defined `--defline-format` strings."}
+                ),
+    'defline-format': (
+            ['--defline-format'],
+            {'default': '{gene_caller_id}',
+             'metavar': "F-STRING",
+             'help': "Proivide a defline template for anvi'o to use when generating the FASTA output. The way this "
+                     "works is actually quite simple: first you learn about all the options that exist using the "
+                     "`--list-defline-variables`, and then use them to create your template. Available variables "
+                     "should be listed within curly brackets, which will be evaluated in contex. Anything outside "
+                     "of curly brackets will be kept as is. For instance, if you would like your defline to have "
+                     "the gene caller ID after the contig name in which it occurs, you can use this template: "
+                     "'{contig_name}_{gene_caller_id}', and your defline will look like '>XXX_182'. See more "
+                     "examples in online help."}
+                ),
     'report-extended-deflines': (
             ['--report-extended-deflines'],
             {'default': False,

diff --git a/anvio/data/misc/KEGG-SNAPSHOTS.yaml b/anvio/data/misc/KEGG-SNAPSHOTS.yaml
@@ -112,6 +112,12 @@ v2024-09-08:
     hash: 5a9644d40061
     modules_db_version: 4
 
+v2024-09-29:
+    url: https://figshare.com/ndownloader/files/49500822
+    archive_name: KEGG_build_2024-09-29_5a9644d40061.tar.gz
+    hash: 5a9644d40061
+    modules_db_version: 4
+
 # How to add a new KEGG snapshot to this file:
 # 1. download the latest data directly from KEGG by running
 #    `anvi-setup-kegg-data -D --kegg-data-dir ./KEGG -T 5 --include-stray-KOs`

diff --git a/anvio/dbops.py b/anvio/dbops.py
@@ -1120,10 +1120,42 @@ def get_gene_amino_acid_sequence(self, gene_caller_ids):
         return sequences
 
 
-    def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_file_path=None, reverse_complement_if_necessary=True, include_aa_sequences=False, flank_length=0,
-                                           output_file_path_external_gene_calls=None, simple_headers=False, report_aa_sequences=False, wrap=120, rna_alphabet=False):
+    def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_file_path=None, reverse_complement_if_necessary=True,
+                                           include_aa_sequences=False, flank_length=0, output_file_path_external_gene_calls=None,
+                                           simple_headers=False, list_defline_variables=False, defline_format='{gene_caller_id}',
+                                           report_aa_sequences=False, wrap=120, rna_alphabet=False):
+
+        ##################################################################################################
+        #
+        # DEFLINE FORMATTING REPORTING RELATED PRE-CHECKS
+        #
+        ##################################################################################################
+        # available options to determine deflines through user-provided f-strings. the dictionary is
+        # populated below, and if you make any changes here, please don't forget to update it there too: 
+        defline_data_dict = {'gene_caller_id': None,
+                             'contig_name': None,
+                             'start': None,
+                             'stop': None,
+                             'direction': None,
+                             'length': None,
+                             'contigs_db_project_name': None} 
+
+        # if the user needs to see the list, show the list and quit
+        if list_defline_variables:
+            self.run.warning(f"Here are the variables you can use to provide a user-defined defline template: ")
+            for key in defline_data_dict.keys():
+                self.run.info_single("{%s}" % key)
+            self.run.info_single("Remember, by default, anvi'o will only use '{gene_caller_id}' to format the deflines of "
+                                 "FASTA files it produces.", level=0, nl_before=1, nl_after=1, mc='red')
+
+            sys.exit()
+
+        ##################################################################################################
+        #
+        # BUNCH OF SANITY CHECKS BEFORE WE GET INTO BUSINESS
+        #
+        ##################################################################################################
 
-        # bunch of sanity checks below
         if not isinstance(gene_caller_ids_list, list):
             raise ConfigError("Gene caller's ids must be of type 'list'")
 
@@ -1166,6 +1198,20 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil
                               "also ask FASTA file headers for gene sequences to be not simple. External gene calls file and the FASTA "
                               "file must match, and anvi'o will have to take care of it without your supervision.")
 
+        # if we came all the way down here without a defline format, let's set one up:
+        if not defline_format:
+            defline_format = "{gene_caller_id}"
+
+        # we will also check if the `defline_format` is composed of variables that are defined in
+        # the `defline_data_dict`, which is filled later
+        utils.get_f_string_evaluated_by_dict(defline_format, defline_data_dict)
+
+        ##################################################################################################
+        #
+        # BUSINESS TIME
+        #
+        ##################################################################################################
+
         # finally getting our sequences initialized. please NOTE that we do it only if there are no
         # contig sequences available OR if the gene caller ids of interest is not represented among
         # those that were previously initialized.
@@ -1255,6 +1301,17 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil
                 else:
                     gene_call['aa_sequence'] = None
 
+            # let's populate the dictionary that holds all the information that could be used to report
+            # gene FASTA files. if you change anything in this dictionary, please don't forget to
+            # update the list of variables where it is first defined in this function.
+            defline_data_dict = {'gene_caller_id': gene_callers_id,
+                                 'contig_name': gene_call['contig'],
+                                 'start': gene_call['start'],
+                                 'stop': gene_call['stop'],
+                                 'direction': gene_call['direction'],
+                                 'length': gene_call['length'],
+                                 'contigs_db_project_name': self.a_meta['project_name_str']} 
+
             if output_file_path_external_gene_calls:
                 # if the user is asking for an external gene calls file, the FASTA file for sequences
                 # should not start with digits and we also need to set the contig name in sequences
@@ -1266,10 +1323,9 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil
                     gene_call['start'] = 0
                     gene_call['stop'] = gene_call['length']
             else:
-                if simple_headers:
-                    gene_call['header'] = '%d' % (gene_callers_id)
-                else:
-                    gene_call['header'] = '%d ' % (gene_callers_id) + ';'.join(['%s:%s' % (k, str(gene_call[k])) for k in ['contig', 'start', 'stop', 'direction', 'rev_compd', 'length']])
+                gene_call['header'] = utils.get_f_string_evaluated_by_dict(defline_format, defline_data_dict)
+                if not simple_headers:
+                    gene_call['header'] += gene_call['header'] + ' ' + ';'.join(['%s:%s' % (k, str(gene_call[k])) for k in ['contig', 'start', 'stop', 'direction', 'rev_compd', 'length']])
 
             # adding the updated gene call to our sequences dict.
             sequences_dict[gene_callers_id] = gene_call
@@ -4247,7 +4303,7 @@ def init(self):
 
         # set a project name for the contigs database without any funny
         # characters to make sure it can be used programmatically later.
-        self.meta['project_name_str'] = self.meta['project_name'].translate({ord(c): "_" for c in "\"'!@#$%^&*()[]{};:,./<>?\|`~-=_+ "}).replace('__', '_') \
+        self.meta['project_name_str'] = self.meta['project_name'].strip().translate({ord(c): "_" for c in "\"'!@#$%^&*()[]{};:,./<>?\|`~-=_+ "}).replace('__', '_').strip('_') \
                                 if self.meta['project_name'] else '___'.join(['UNKNOWN', self.meta['contigs_db_hash']])
 
         if 'creation_date' not in self.meta:

diff --git a/anvio/docs/artifacts/kegg-data.md b/anvio/docs/artifacts/kegg-data.md
@@ -24,6 +24,7 @@ KEGG
  |- MODULES.db
  |- ko_list.txt
  |- modules.keg
+ |- br08901.json
  |- hierarchies.json
  |- HMMs
  |   |- Kofam.hmm
@@ -56,7 +57,8 @@ However, for the curious, here is a description of each component in this data d
 - The `orphan_data` subfolder: contains KOfam profiles for KOs that do not have a bitscore threshold in the `ko_list.txt` file (in the `.hmm` file) and their corresponding entries from the `ko_list.txt` file (in `01_ko_fams_with_no_threshold.txt`). Please note that KOs from the `orphan_data` directory will *not* be annotated in your %(contigs-db)s when you run %(anvi-run-kegg-kofams)s. However, if you ever need to take a look at these profiles or use them in any way, here they are. :)
 - `modules.keg`: a flat text file describing all metabolic modules available in the [KEGG MODULE](https://www.genome.jp/kegg/module.html) resource. This includes pathway and signature modules, but not reaction modules.
 - The `modules` subfolder: contains flat text files, one for each metabolic module, downloaded using the [KEGG REST API](https://www.kegg.jp/kegg/rest/keggapi.html). Each file describes a metabolic module's definition, classification, component orthologs, metabolic reactions, compounds, and any miscellaneous data like references and such. For an example, see the [module file for M00001](https://rest.kegg.jp/get/M00001/).
-- `hierarchies.json`: a JSON-formatted file describing the available functional hierarchies in the [KEGG BRITE](https://www.genome.jp/kegg/brite.html) resource.
+- `br08901.json`: a JSON-formatted KEGG BRITE [file](https://rest.kegg.jp/get/br:br08901/json) classifying [KEGG pathway maps](https://www.genome.jp/kegg/pathway.html).
+- `hierarchies.json`: a JSON-formatted KEGG BRITE [file](https://rest.kegg.jp/get/br:br08902/json) describing the available functional hierarchies in the [KEGG BRITE](https://www.genome.jp/kegg/brite.html) resource.
 - The `BRITE` subfolder: contains JSON-formatted files, each one of which describes a BRITE hierarchy.
 - `MODULES.db`: a SQLite database containing data parsed from the module files and BRITE hierarchies. See %(modules-db)s.
 

diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_db_group_grid.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_db_group_grid.png
diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_db_groups_folate.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_db_groups_folate.png
diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_db_groups_galactose.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_db_groups_galactose.png
diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_db_groups_global.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/kos_db_groups_global.png
diff --git a/anvio/docs/images/png/anvi-draw-kegg-pathways/output_options.png b/anvio/docs/images/png/anvi-draw-kegg-pathways/output_options.png