sorgerlab · bgyori · Jul 23, 2024 · Jul 17, 2024 · Jul 17, 2024 · Jul 17, 2024
diff --git a/indra/sources/biopax/processor.py b/indra/sources/biopax/processor.py
@@ -14,7 +14,7 @@
     assert_valid_db_refs, validate_id
 from indra.ontology.standardize import standardize_name_db_refs
 from indra.databases import hgnc_client, uniprot_client, chebi_client, \
-    parse_identifiers_url
+    parse_identifiers_url, bioregistry_client
 
 logger = logging.getLogger(__name__)
 
@@ -315,12 +315,12 @@ def get_activity_modification(self):
             assert isinstance(agent, Agent)
             if gained_mods:
                 ag = copy.deepcopy(agent)
-                ag.mods = gained_mods
+                ag.mods = list(gained_mods)
                 stmt = ActiveForm(ag, 'activity', is_active, evidence=ev)
                 self.statements.append(stmt)
             if lost_mods:
                 ag = copy.deepcopy(agent)
-                ag.mods = lost_mods
+                ag.mods = list(lost_mods)
                 stmt = ActiveForm(ag, 'activity', not is_active, evidence=ev)
                 self.statements.append(stmt)
 
@@ -664,7 +664,14 @@ def _get_reference_primary_id(entref: bp.EntityReference):
         # This is a simple check to see if we should treat this as a URL
         if not entref.uid.startswith('http'):
             return None, None
-        primary_ns, primary_id = parse_identifiers_url(entref.uid)
+        url = entref.uid
+        if 'bioregistry' in url:
+            primary_ns, primary_id = \
+                bioregistry_client.get_ns_id_from_bioregistry_curie(
+                    url.split("/")[-1]
+                )
+        else:
+            primary_ns, primary_id = parse_identifiers_url(url)
         return primary_ns, primary_id
 
     @staticmethod

diff --git a/indra/sources/dgi/api.py b/indra/sources/dgi/api.py
@@ -13,12 +13,11 @@
 
 USECOLS = [
     "gene_name",
-    "entrez_id",
-    "interaction_claim_source",
-    "interaction_types",
+    "gene_concept_id",
+    "interaction_source_db_name",
+    "interaction_type",
     "drug_name",
     "drug_concept_id",
-    "PMIDs",
 ]
 
 
@@ -87,9 +86,10 @@ def get_version_df(version: Optional[str] = None) -> Tuple[str, pd.DataFrame]:
         else:
             version = bioversions.get_version("Drug Gene Interaction Database")
     if version is None:
-        version = "2021-Jan"
-        logger.warning(f"Could not find version with bioregistry, using"
+        version = "latest"
+        logger.warning(f"Could not find version with bioregistry, using "
                        f"version {version}.")
-    url = f"https://www.dgidb.org/data/monthly_tsvs/{version}/interactions.tsv"
+    url = f"https://www.dgidb.org/data/{version}/interactions.tsv"
     df = pd.read_csv(url, usecols=USECOLS, sep="\t", dtype=str)
+    df = df[USECOLS]
     return version, df
diff --git a/indra/sources/dgi/processor.py b/indra/sources/dgi/processor.py
@@ -7,6 +7,7 @@
 
 import pandas as pd
 
+from ...ontology.bio import bio_ontology
 from ...ontology.standardize import get_standard_agent
 from ...statements import (
     default_ns_order,
@@ -97,7 +98,13 @@ def row_to_statements(
         pmids,
     ) -> Iterable[Statement]:
         """Convert a row in the DGI dataframe to a statement."""
-        gene_agent = get_standard_agent(gene_name, {"EGID": ncbigene_id})
+        if bio_ontology.get_id_from_name('HGNC', gene_name):
+            _, gene_id = bio_ontology.get_id_from_name('HGNC', gene_name)
+            gene_agent = get_standard_agent(gene_name, {"HGNC": gene_id})
+        else:
+            self.skipped += 1
+            return
+
 
         try:
             drug_namespace, drug_identifier = drug_curie.split(":", 1)
@@ -133,9 +140,9 @@ def row_to_statements(
 def process_df(df: pd.DataFrame) -> pd.DataFrame:
     """Process the DGI interactions dataframe."""
     # remove rows with missing information
-    df = df[df["entrez_id"].notna()]
+    df = df[df["gene_concept_id"].notna()]
     df = df[df["drug_concept_id"].notna()]
-    df["PMIDs"] = df["PMIDs"].map(_safe_split)
+    df['pmids'] = None
     return df
 
 
@@ -257,6 +264,8 @@ def _safe_split(s: str) -> List[str]:
     "multitarget",
     "vaccine",
     "nan",
+    'immunotherapy',
+    'other/unknown',
 }
 
 _UNHANDLED = set()

diff --git a/indra/sources/rlimsp/api.py b/indra/sources/rlimsp/api.py
@@ -4,9 +4,11 @@
            'process_from_json_file',
            'process_from_jsonish_str']
 
+import os
 import json
 import logging
 import requests
+from multiprocessing import Pool
 
 from .processor import RlimspProcessor
 
@@ -84,8 +86,13 @@ def process_jsonl_file(filename, doc_id_type=None):
         rp.extract_statements()
     return rp
 
+def process_line(line):
+    try:
+        return json.loads(line)
+    except json.JSONDecodeError:
+        return None
 
-def process_jsonl_str(jsonl_str, doc_id_type=None):
+def process_jsonl_str(jsonl_str, doc_id_type=None, num_processes=None):
     """Process RLIMSP extractions from a JSON-L string.
 
     Parameters
@@ -98,14 +105,26 @@ def process_jsonl_str(jsonl_str, doc_id_type=None):
         'pmcid' explicitly, instead if contains a 'docId' key. This parameter
         allows defining what ID type 'docId' sould be interpreted as. Its
         values should be 'pmid' or 'pmcid' or None if not used.
+    num_processes : Optional[int]
+        The number of processes to use for parallel processing of the JSON-L
+        lines. If None, the number of processes is set to the number of CPUs
+        on the machine. If 1, no parallel processing is done.
+        Otherwise the provided number of processes is used.
 
     Returns
     -------
     :py:class:`indra.sources.rlimsp.processor.RlimspProcessor`
         An RlimspProcessor which contains a list of extracted INDRA Statements
         in its statements attribute.
     """
-    json_list = [json.loads(line) for line in jsonl_str.splitlines()]
+    if num_processes is None:
+        num_processes = os.cpu_count()
+    if num_processes > 1:
+        with Pool(num_processes) as pool:
+            json_list = pool.map(process_line, jsonl_str.splitlines())
+    else:
+        json_list = [process_line(line) for line in jsonl_str.splitlines()]
+    json_list = [obj for obj in json_list if obj is not None]
     rp = RlimspProcessor(json_list, doc_id_type=doc_id_type)
     rp.extract_statements()
     return rp

diff --git a/indra/tests/test_sources/resources/dgi_sample_interactions.tsv b/indra/tests/test_sources/resources/dgi_sample_interactions.tsv
@@ -1,3 +1,3 @@
-gene_name	gene_claim_name	entrez_id	interaction_claim_source	interaction_types	drug_claim_name	drug_claim_primary_name	drug_name	drug_concept_id	interaction_group_score	PMIDs
-CDK7	CDK7	1022	CancerCommons	inhibitor	SNS-032	SNS-032	BMS-387032	chembl:CHEMBL296468	0.82	
-VDR	VDR	7421	DTC		NIFEKALANT	NIFEKALANT	NIFEKALANT	chembl:CHEMBL360861	0.12	
+gene_claim_name	gene_concept_id	gene_name	interaction_source_db_name	interaction_source_db_version	interaction_type	interaction_score	drug_claim_name	drug_concept_id	drug_name	approved	immunotherapy	anti_neoplastic
+CDK7	hgnc:1778	CDK7	ChEMBL	32	inhibitor	0.362954178	BMS-387032	iuphar.ligand:5670	BMS-387032	FALSE	FALSE	FALSE
+CYP3A	ncbigene:1574	CYP3A	PharmGKB	5-Jun-23	NULL	0.090599161	clozapine	rxcui:2626	CLOZAPINE	TRUE	FALSE	FALSE
diff --git a/indra/tests/test_sources/test_dgi.py b/indra/tests/test_sources/test_dgi.py
@@ -17,20 +17,19 @@
 def test_dgi_processor():
     """Test the DGI processor."""
     df = pd.read_csv(TEST_FILE, sep='\t', usecols=USECOLS, dtype=str)
+    df = df[USECOLS]
     dp = process_df(df)
     statement = dp.statements[0]
     assert isinstance(statement, Inhibition)
     assert statement.obj.name == 'CDK7'
     assert statement.obj.db_refs['EGID'] == '1022'
     assert statement.obj.db_refs['HGNC'] == '1778'
-    assert statement.subj.name == \
-        (r'N-(5-\{[(5-tert-butyl-1,3-oxazol-2-yl)methyl]sulfanyl\}-1,3-'
-         r'thiazol-2-yl)piperidine-4-carboxamide'), statement.subj.name
-    assert statement.subj.db_refs['CHEMBL'] == 'CHEMBL296468'
+    assert statement.subj.name == 'BMS-387032'
+    assert statement.subj.db_refs['IUPHAR.LIGAND'] == '5670'
     assert 1 == len(statement.evidence)
     evidence = statement.evidence[0]
     assert evidence.pmid is None
-    assert evidence.annotations == {'source': 'CancerCommons',
+    assert evidence.annotations == {'source': 'ChEMBL',
                                     'interactions': 'inhibitor'}