Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes to input processors due to changes in sources #1454

Merged
merged 7 commits into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions indra/sources/biopax/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
assert_valid_db_refs, validate_id
from indra.ontology.standardize import standardize_name_db_refs
from indra.databases import hgnc_client, uniprot_client, chebi_client, \
parse_identifiers_url
parse_identifiers_url, bioregistry_client

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -315,12 +315,12 @@ def get_activity_modification(self):
assert isinstance(agent, Agent)
if gained_mods:
ag = copy.deepcopy(agent)
ag.mods = gained_mods
ag.mods = list(gained_mods)
stmt = ActiveForm(ag, 'activity', is_active, evidence=ev)
self.statements.append(stmt)
if lost_mods:
ag = copy.deepcopy(agent)
ag.mods = lost_mods
ag.mods = list(lost_mods)
stmt = ActiveForm(ag, 'activity', not is_active, evidence=ev)
self.statements.append(stmt)

Expand Down Expand Up @@ -664,7 +664,14 @@ def _get_reference_primary_id(entref: bp.EntityReference):
# This is a simple check to see if we should treat this as a URL
if not entref.uid.startswith('http'):
return None, None
primary_ns, primary_id = parse_identifiers_url(entref.uid)
url = entref.uid
if 'bioregistry' in url:
primary_ns, primary_id = \
bioregistry_client.get_ns_id_from_bioregistry_curie(
url.split("/")[-1]
)
else:
primary_ns, primary_id = parse_identifiers_url(url)
return primary_ns, primary_id

@staticmethod
Expand Down
14 changes: 7 additions & 7 deletions indra/sources/dgi/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,11 @@

USECOLS = [
"gene_name",
"entrez_id",
"interaction_claim_source",
"interaction_types",
"gene_concept_id",
"interaction_source_db_name",
"interaction_type",
"drug_name",
"drug_concept_id",
"PMIDs",
]


Expand Down Expand Up @@ -87,9 +86,10 @@ def get_version_df(version: Optional[str] = None) -> Tuple[str, pd.DataFrame]:
else:
version = bioversions.get_version("Drug Gene Interaction Database")
if version is None:
version = "2021-Jan"
logger.warning(f"Could not find version with bioregistry, using"
version = "latest"
logger.warning(f"Could not find version with bioregistry, using "
f"version {version}.")
url = f"https://www.dgidb.org/data/monthly_tsvs/{version}/interactions.tsv"
url = f"https://www.dgidb.org/data/{version}/interactions.tsv"
df = pd.read_csv(url, usecols=USECOLS, sep="\t", dtype=str)
df = df[USECOLS]
return version, df
15 changes: 12 additions & 3 deletions indra/sources/dgi/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pandas as pd

from ...ontology.bio import bio_ontology
from ...ontology.standardize import get_standard_agent
from ...statements import (
default_ns_order,
Expand Down Expand Up @@ -97,7 +98,13 @@ def row_to_statements(
pmids,
) -> Iterable[Statement]:
"""Convert a row in the DGI dataframe to a statement."""
gene_agent = get_standard_agent(gene_name, {"EGID": ncbigene_id})
if bio_ontology.get_id_from_name('HGNC', gene_name):
_, gene_id = bio_ontology.get_id_from_name('HGNC', gene_name)
gene_agent = get_standard_agent(gene_name, {"HGNC": gene_id})
else:
self.skipped += 1
return


try:
drug_namespace, drug_identifier = drug_curie.split(":", 1)
Expand Down Expand Up @@ -133,9 +140,9 @@ def row_to_statements(
def process_df(df: pd.DataFrame) -> pd.DataFrame:
"""Process the DGI interactions dataframe."""
# remove rows with missing information
df = df[df["entrez_id"].notna()]
df = df[df["gene_concept_id"].notna()]
df = df[df["drug_concept_id"].notna()]
df["PMIDs"] = df["PMIDs"].map(_safe_split)
df['pmids'] = None
return df


Expand Down Expand Up @@ -257,6 +264,8 @@ def _safe_split(s: str) -> List[str]:
"multitarget",
"vaccine",
"nan",
'immunotherapy',
'other/unknown',
}

_UNHANDLED = set()
Expand Down
23 changes: 21 additions & 2 deletions indra/sources/rlimsp/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
'process_from_json_file',
'process_from_jsonish_str']

import os
import json
import logging
import requests
from multiprocessing import Pool

from .processor import RlimspProcessor

Expand Down Expand Up @@ -84,8 +86,13 @@ def process_jsonl_file(filename, doc_id_type=None):
rp.extract_statements()
return rp

def process_line(line):
try:
return json.loads(line)
except json.JSONDecodeError:
return None

def process_jsonl_str(jsonl_str, doc_id_type=None):
def process_jsonl_str(jsonl_str, doc_id_type=None, num_processes=None):
"""Process RLIMSP extractions from a JSON-L string.

Parameters
Expand All @@ -98,14 +105,26 @@ def process_jsonl_str(jsonl_str, doc_id_type=None):
'pmcid' explicitly, instead it contains a 'docId' key. This parameter
allows defining what ID type 'docId' should be interpreted as. Its
values should be 'pmid' or 'pmcid' or None if not used.
num_processes : Optional[int]
The number of processes to use for parallel processing of the JSON-L
lines. If None, the number of processes is set to the number of CPUs
on the machine. If 1, no parallel processing is done.
Otherwise the provided number of processes is used.

Returns
-------
:py:class:`indra.sources.rlimsp.processor.RlimspProcessor`
An RlimspProcessor which contains a list of extracted INDRA Statements
in its statements attribute.
"""
json_list = [json.loads(line) for line in jsonl_str.splitlines()]
if num_processes is None:
num_processes = os.cpu_count()
if num_processes > 1:
with Pool(num_processes) as pool:
json_list = pool.map(process_line, jsonl_str.splitlines())
else:
json_list = [process_line(line) for line in jsonl_str.splitlines()]
json_list = [obj for obj in json_list if obj is not None]
rp = RlimspProcessor(json_list, doc_id_type=doc_id_type)
rp.extract_statements()
return rp
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
gene_name gene_claim_name entrez_id interaction_claim_source interaction_types drug_claim_name drug_claim_primary_name drug_name drug_concept_id interaction_group_score PMIDs
CDK7 CDK7 1022 CancerCommons inhibitor SNS-032 SNS-032 BMS-387032 chembl:CHEMBL296468 0.82
VDR VDR 7421 DTC NIFEKALANT NIFEKALANT NIFEKALANT chembl:CHEMBL360861 0.12
gene_claim_name gene_concept_id gene_name interaction_source_db_name interaction_source_db_version interaction_type interaction_score drug_claim_name drug_concept_id drug_name approved immunotherapy anti_neoplastic
CDK7 hgnc:1778 CDK7 ChEMBL 32 inhibitor 0.362954178 BMS-387032 iuphar.ligand:5670 BMS-387032 FALSE FALSE FALSE
CYP3A ncbigene:1574 CYP3A PharmGKB 5-Jun-23 NULL 0.090599161 clozapine rxcui:2626 CLOZAPINE TRUE FALSE FALSE
9 changes: 4 additions & 5 deletions indra/tests/test_sources/test_dgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,19 @@
def test_dgi_processor():
"""Test the DGI processor."""
df = pd.read_csv(TEST_FILE, sep='\t', usecols=USECOLS, dtype=str)
df = df[USECOLS]
dp = process_df(df)
statement = dp.statements[0]
assert isinstance(statement, Inhibition)
assert statement.obj.name == 'CDK7'
assert statement.obj.db_refs['EGID'] == '1022'
assert statement.obj.db_refs['HGNC'] == '1778'
assert statement.subj.name == \
(r'N-(5-\{[(5-tert-butyl-1,3-oxazol-2-yl)methyl]sulfanyl\}-1,3-'
r'thiazol-2-yl)piperidine-4-carboxamide'), statement.subj.name
assert statement.subj.db_refs['CHEMBL'] == 'CHEMBL296468'
assert statement.subj.name == 'BMS-387032'
assert statement.subj.db_refs['IUPHAR.LIGAND'] == '5670'
assert 1 == len(statement.evidence)
evidence = statement.evidence[0]
assert evidence.pmid is None
assert evidence.annotations == {'source': 'CancerCommons',
assert evidence.annotations == {'source': 'ChEMBL',
'interactions': 'inhibitor'}


Expand Down
Loading