Skip to content

Commit

Permalink
Add UniProt target mapping functions (#11)
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt authored Nov 19, 2022
1 parent a4a59f4 commit 9ae842f
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 1 deletion.
2 changes: 2 additions & 0 deletions src/chembl_downloader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
download_readme,
download_sdf,
download_sqlite,
download_uniprot_mapping,
get_chemreps_df,
get_date,
get_monomer_library_root,
get_substructure_library,
get_uniprot_mapping_df,
latest,
query,
supplier,
Expand Down
73 changes: 73 additions & 0 deletions src/chembl_downloader/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
# Monomers
"download_monomer_library",
"get_monomer_library_root",
# UniProt mappings
"download_uniprot_mapping",
"get_uniprot_mapping_df",
]

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -588,3 +591,73 @@ def get_date(version: str, **kwargs) -> str:
else:
day, month, year = date_p.split("/")
return f"{year}-{month}-{day}"


def download_uniprot_mapping(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
    return_version: bool = False,
):
    """Make sure the ChEMBL-UniProt target mapping TSV file is available locally.

    The actual work is delegated to ``_download_helper``, which is asked for
    the file named ``chembl_uniprot_mapping.txt``.

    :param version: The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :param return_version: Should the version get returned? Turn this to true
        if you're looking up the latest version and want to reduce redundant code.
    :return: If ``return_version`` is true, return a pair of the version and the
        local file path to the downloaded ``*.txt`` file. Otherwise,
        just return the path.
    """
    helper_options = {
        "version": version,
        "prefix": prefix,
        "return_version": return_version,
        # presumably the remote file name carries no version component;
        # confirm against ``_download_helper``
        "filename_repeats_version": False,
    }
    return _download_helper("chembl_uniprot_mapping.txt", **helper_options)


def get_uniprot_mapping_df(
    version: Optional[str] = None,
    *,
    prefix: Optional[Sequence[str]] = None,
) -> "pandas.DataFrame":
    """Fetch the ChEMBL-UniProt target mapping TSV file and load it as a dataframe.

    The file is obtained through :func:`download_uniprot_mapping` and then
    parsed as tab-separated text.

    :param version:
        The version number of ChEMBL to get. If none specified, uses
        :func:`latest` to look up the latest.
    :param prefix: The directory inside :mod:`pystow` to use
    :return: A dataframe with four columns:

        1. ``uniprot_id``
        2. ``chembl_target_id``
        3. ``name``, the name from ChEMBL
        4. ``type``, which can have one of the following values:

           - ``CHIMERIC PROTEIN``
           - ``NUCLEIC-ACID``
           - ``PROTEIN COMPLEX``
           - ``PROTEIN COMPLEX GROUP``
           - ``PROTEIN FAMILY``
           - ``PROTEIN NUCLEIC-ACID COMPLEX``
           - ``PROTEIN-PROTEIN INTERACTION``
           - ``SELECTIVITY GROUP``
           - ``SINGLE PROTEIN``
    """
    import pandas as pd

    column_names = ["uniprot_id", "chembl_target_id", "name", "type"]

    # With return_version=False the download function hands back only the path,
    # so the cast merely narrows the type for static checkers.
    mapping_path = cast(
        Path,
        download_uniprot_mapping(version=version, prefix=prefix, return_version=False),
    )

    # The first line of the file is skipped and the column names are supplied
    # explicitly rather than read from the file.
    return pd.read_csv(
        mapping_path,
        sep="\t",
        skiprows=1,
        header=None,
        names=column_names,
    )
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ deps =
flake8-bugbear
flake8-colors
flake8-docstrings
flake8-isort
flake8-isort==5.0.0 # temporary pin; remove once the breakage following the Nov 18/19, 2022 releases is fixed
flake8-print
pep8-naming
pydocstyle
Expand Down

0 comments on commit 9ae842f

Please sign in to comment.