#97 taxonomy config issues fixed

obophenotype · Sep 10, 2021 · e486ecc
1 parent 04e290b
commit e486ecc
Show file tree

Hide file tree

Showing 8 changed files with 327 additions and 329 deletions.
diff --git a/src/config/config_schema.json b/src/config/config_schema.json
@@ -56,6 +56,13 @@
             "description": "Brain region abbreviation."
           }
         },
+        "Reference_gene_list": {
+          "type": "array",
+          "items": {
+            "type": "string",
+            "description": "Reference gene list."
+          }
+        },
         "Root_nodes": {
           "type": "array",
           "items": {

diff --git a/src/dendrograms/taxonomy_details.yaml b/src/dendrograms/taxonomy_details.yaml
@@ -12,7 +12,8 @@
     - UBERON:0001384
   Brain_region_abbv:
     - MOp
-  Ensemble_data: Ensmusg
+  Reference_gene_list:
+    - Ensmusg
   Root_nodes:
     - Node: CS202002013_123
       Cell_type: CL:0011005 
@@ -144,7 +145,8 @@
     - NCBITaxon:9606
   Brain_region:
     - UBERON:0001384
-  Ensemble_data: Ensg
+  Reference_gene_list:
+    - Ensg
   Root_nodes:
     - Node: CS201912131_149
       Cell_type: CL:0011005 
@@ -173,34 +175,27 @@
   non_taxonomy_roots:      
     - Node: CS201912131_128
       Cell_type: CL:4023011
-      Location_relation: has_soma_location
       #Lamp5
     - Node: CS201912131_129
       Cell_type: CL:4023015
-      Location_relation: has_soma_location
       #Sncg
     - Node: CS201912131_130
       Cell_type: CL:4023016
-      Location_relation: has_soma_location
       #Vip
     - Node: CS201912131_131
       Cell_type: CL:4023017
-      Location_relation: has_soma_location
       #Sst
     - Node: CS201912131_132
       Cell_type: CL:4023018
-      Location_relation: part_of
       #Pvalb
     - Node: CS201912131_147
       Cell_type: CL:0002453
-      Location_relation: part_of
       #OPC
     - Node: CS201912131_151
       Cell_type: CL:0000125
       # non-neuronal currently mapped as glial cell
     - Node: CS201912131_140
       Cell_type: CL:0000128
-      Location_relation: part_of
       #Oligodendrocyte
 -
   Taxonomy_id: CCN201912132
@@ -212,7 +207,8 @@
     - NCBITaxon:9483
   Brain_region:
     - UBERON:0001384
-  Ensemble_data: Enscjag
+  Reference_gene_list:
+    - Enscjag
   Root_nodes:
     - Node: CS201912132_116
       Cell_type: CL:0011005
@@ -249,27 +245,21 @@
   non_taxonomy_roots:  
     - Node: CS201912132_105
       Cell_type: CL:0000128
-      Location_relation: part_of
       #Oligodendrocyte
     - Node: CS201912132_95
       Cell_type: CL:4023011
-      Location_relation: has_soma_location
       #Lamp5
     - Node: CS201912132_96
       Cell_type: CL:4023015
-      Location_relation: has_soma_location
       #Sncg
     - Node: CS201912132_97
       Cell_type: CL:4023016
-      Location_relation: has_soma_location
       #Vip
     - Node: CS201912132_98
       Cell_type: CL:4023017
-      Location_relation: has_soma_location
       #Sst
     - Node: CS201912132_99
       Cell_type: CL:4023018
-      Location_relation: part_of
       #Pvalb
     - Node: CS201912131_151
       Cell_type: CL:0000125

diff --git a/src/markers/CS202002013_markers_denormalized.tsv b/src/markers/CS202002013_markers_denormalized.tsv
diff --git a/src/patterns/data/default/CCN202002013_class_base.tsv b/src/patterns/data/default/CCN202002013_class_base.tsv
diff --git a/src/scripts/dosdp_pattern_generation.py b/src/scripts/dosdp_pattern_generation.py
@@ -2,7 +2,7 @@
 import os
 
 from dendrogram_tools import dend_json_2_nodes_n_edges
-from template_generation_utils import read_taxonomy_config, get_subtrees, read_dendrogram_tree, read_ensemble_data, \
+from template_generation_utils import read_taxonomy_config, get_subtrees, read_dendrogram_tree, read_gene_data, \
     read_markers, get_gross_cell_type
 
 MARKER_PATH = '../markers/CS{}_markers.tsv'
@@ -23,13 +23,13 @@ def generate_pattern_table_denormalised_markers(dend_json_path, output_filepath)
     dend_tree = read_dendrogram_tree(dend_json_path)
 
     marker_path = MARKER_PATH.format(str(taxon).replace("CCN", ""))
-    ensemble_path = ENSEMBLE_PATH.format(str(taxonomy_config["Ensemble_data"]).strip().lower())
+    gene_db_path = ENSEMBLE_PATH.format(str(taxonomy_config["Reference_gene_list"][0]).strip().lower())
 
     if taxonomy_config:
         subtrees = get_subtrees(dend_tree, taxonomy_config)
-        ensmusg_names = read_ensemble_data(ensemble_path)
-        denorm_markers = get_denorm_markers(taxon, ensmusg_names)
-        minimal_markers = read_markers(marker_path, ensmusg_names)
+        gene_names = read_gene_data(gene_db_path)
+        denorm_markers = get_denorm_markers(taxon, gene_names)
+        minimal_markers = read_markers(marker_path, gene_names)
 
         dl = []
         for o in dend['nodes']:

diff --git a/src/scripts/marker_tools.py b/src/scripts/marker_tools.py
@@ -175,7 +175,7 @@ def generate_marker_table(marker_data, output_filepath):
         d = dict()
         d['Taxonomy_node_ID'] = o
         d['clusterName'] = marker_data[o][CLUSTER]
-        d['Markers'] = EXPRESSION_SEPARATOR.join(marker_data[o][EXPRESSIONS])
+        d['Markers'] = EXPRESSION_SEPARATOR.join(sorted(marker_data[o][EXPRESSIONS]))
         for k in robot_marker_template_seed.keys():
             if not (k in d.keys()):
                 d[k] = ''

diff --git a/src/scripts/template_generation_tools.py b/src/scripts/template_generation_tools.py
@@ -6,7 +6,7 @@
 from dendrogram_tools import dend_json_2_nodes_n_edges
 from template_generation_utils import get_synonyms_from_taxonomy, get_synonym_pairs, read_taxonomy_config, \
     get_subtrees, read_dendrogram_tree, get_dend_subtrees, index_dendrogram,\
-    read_csv, read_ensemble_data, read_markers, get_gross_cell_type, merge_tables
+    read_csv, read_gene_data, read_markers, get_gross_cell_type, merge_tables
 
 
 log = logging.getLogger(__name__)
@@ -85,13 +85,13 @@ def generate_base_class_template(dend_json_path, output_filepath):
 
     marker_path = MARKER_PATH.format(str(taxon).replace("CCN", ""))
     allen_marker_path = ALLEN_MARKER_PATH.format(str(taxon).replace("CCN", ""))
-    ensemble_path = ENSEMBLE_PATH.format(str(taxonomy_config["Ensemble_data"]).strip().lower())
+    gene_db_path = ENSEMBLE_PATH.format(str(taxonomy_config["Reference_gene_list"][0]).strip().lower())
 
     if taxonomy_config:
         subtrees = get_subtrees(dend_tree, taxonomy_config)
-        ensmusg_names = read_ensemble_data(ensemble_path)
-        minimal_markers = read_markers(marker_path, ensmusg_names)
-        allen_markers = read_markers(allen_marker_path, ensmusg_names)
+        gene_names = read_gene_data(gene_db_path)
+        minimal_markers = read_markers(marker_path, gene_names)
+        allen_markers = read_markers(allen_marker_path, gene_names)
 
         robot_class_curation_seed = ['defined_class',
                                      'prefLabel',

diff --git a/src/scripts/template_generation_utils.py b/src/scripts/template_generation_utils.py
@@ -27,8 +27,9 @@ def get_synonyms_from_taxonomy(node):
     """
     synonym_properties = ['cell_set_preferred_alias', 'original_label', 'cell_set_label', 'cell_set_aligned_alias',
                           'cell_set_additional_aliases']
+    synonyms = {node[prop] for prop in synonym_properties if prop in node.keys() and node[prop]}
 
-    return OR_SEPARATOR.join({node[prop] for prop in synonym_properties if prop in node.keys() and node[prop]})
+    return OR_SEPARATOR.join(sorted(synonyms))
 
 
 def get_synonym_pairs(node):
@@ -270,17 +271,17 @@ def index_dendrogram(dend):
     return dend_dict
 
 
-def read_ensemble_data(ensemble_path):
-    ensemble = {}
-    with open(ensemble_path) as fd:
+def read_gene_data(gene_db_path):
+    genes = {}
+    with open(gene_db_path) as fd:
         rd = csv.reader(fd, delimiter="\t", quotechar='"')
-        # skip first 2 rows
+        # skip first 2 header rows
         next(rd)
         next(rd)
         for row in rd:
             _id = row[0]
-            ensemble[_id] = row[2]
-    return ensemble
+            genes[_id] = row[2]
+    return genes
 
 
 def read_markers(marker_path, ensmusg_names):