From a4a59f4ad2368f174be4811f9fd070cffbbac574 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 3 Nov 2022 13:41:04 +0100 Subject: [PATCH] Improve ChEBI mapping notebook (#10) * Add stub for checking chebi reverse mappings * Update chebi-mappings.ipynb * Clean up notebooks --- notebooks/chebi-mappings.ipynb | 282 ++++++++++++++++++---------- notebooks/drug-indications.ipynb | 4 +- notebooks/refresh-static-data.ipynb | 4 +- tox.ini | 2 + 4 files changed, 187 insertions(+), 105 deletions(-) diff --git a/notebooks/chebi-mappings.ipynb b/notebooks/chebi-mappings.ipynb index 6a9ff55..afe0fa1 100644 --- a/notebooks/chebi-mappings.ipynb +++ b/notebooks/chebi-mappings.ipynb @@ -27,12 +27,13 @@ "import gilda.grounder\n", "import gilda.term\n", "import pandas as pd\n", + "import pystow\n", "from biomappings.gilda_utils import iter_prediction_tuples\n", "from gilda.process import normalize\n", - "from tqdm.auto import tqdm\n", "from IPython.display import Markdown\n", + "from tqdm.auto import tqdm\n", "\n", - "from chembl_downloader import queries, query, latest" + "from chembl_downloader import latest, queries, query" ] }, { @@ -63,7 +64,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Fri Oct 28 15:16:09 2022\n" + "Thu Nov 3 13:32:50 2022\n" ] } ], @@ -114,7 +115,7 @@ " chembl_id,\n", " pref_name\n", "FROM MOLECULE_DICTIONARY\n", - "WHERE \n", + "WHERE\n", " chebi_par_id IS NULL\n", " AND pref_name IS NOT NULL\n", "```" @@ -151,7 +152,7 @@ "output_type": "stream", "text": [ "CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs\n", - "Wall time: 5.25 µs\n" + "Wall time: 5.96 µs\n" ] } ], @@ -278,6 +279,85 @@ "df" ] }, + { + "cell_type": "markdown", + "id": "8d9be11f", + "metadata": {}, + "source": [ + "## What's Already in ChEBI\n", + "\n", + "ChEBI also maintains its own mappings to ChEMBL - investigate if there's anything available there that is not already available in ChEMBL before moving on to propose new mappings." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9d196b15", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/cthoyt/.virtualenvs/cheminf310/lib/python3.10/site-packages/pystow/impl.py:599: DtypeWarning: Columns (1,3,4) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " return pd.read_csv(path, **_clean_csv_kwargs(read_csv_kwargs))\n" + ] + }, + { + "data": { + "text/plain": [ + "34312" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chebi_url = \"https://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/reference.tsv.gz\"\n", + "chebi_df = pystow.ensure_csv(\n", + " \"bio\",\n", + " \"chebi\",\n", + " url=chebi_url,\n", + " read_csv_kwargs=dict(\n", + " compression=\"gzip\",\n", + " sep=\"\\t\",\n", + " encoding=\"unicode_escape\",\n", + " on_bad_lines=\"skip\",\n", + " ),\n", + ")\n", + "chebi_mappings = dict(\n", + " chebi_df[chebi_df.REFERENCE_DB_NAME == \"ChEMBL\"][[\"REFERENCE_ID\", \"COMPOUND_ID\"]].values\n", + ")\n", + "len(chebi_mappings)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "397d2fd1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "there are 4,041/39,230 (10.30%) extra mappings from ChEBI\n" + ] + } + ], + "source": [ + "chebi_idx = df.chembl_id.isin(set(chebi_mappings))\n", + "\n", + "print(\n", + " f\"there are {chebi_idx.sum():,}/{len(df.index):,} ({chebi_idx.sum()/len(df.index):.2%}) \"\n", + " f\"extra mappings from ChEBI\"\n", + ")\n", + "\n", + "df = df[~chebi_idx]" + ] + }, { "cell_type": "markdown", "id": "2c8748be", @@ -290,19 +370,19 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "d87f8e43", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7c7a8425031c4d3395beca67e8f7b685", + "model_id": "8c05aef5cbe44501938ec88d5fe86ffb", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0.00/39.2k [00:00\n", " \n", " \n", - " 4377\n", + " 497\n", " chebi\n", - " CHEBI:80180\n", - " Indolicidin\n", + " CHEBI:190867\n", + " 1-AMINOCYCLOBUTANE CARBOXYLIC ACID\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL2251917\n", - " Indolicidin\n", + " CHEMBL131244\n", + " 1-AMINOCYCLOBUTANE CARBOXYLIC ACID\n", " lexical\n", " 0.777778\n", " notebook\n", " \n", " \n", - " 3524\n", + " 904\n", " chebi\n", - " CHEBI:40701\n", - " ALRESTATIN\n", + " CHEBI:34827\n", + " M2\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL63055\n", - " ALRESTATIN\n", + " CHEMBL4525134\n", + " M2\n", " lexical\n", " 0.777778\n", " notebook\n", " \n", " \n", - " 1765\n", + " 927\n", " chebi\n", - " CHEBI:167900\n", - " PJ34\n", + " CHEBI:35811\n", + " 2-endo-hydroxy-1,8-cineole\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL3233481\n", - " PJ34\n", + " CHEMBL2229602\n", + " 2-endo-hydroxy-1,8-cineole\n", " lexical\n", " 0.777778\n", " notebook\n", " \n", " \n", - " 1766\n", + " 443\n", " chebi\n", - " CHEBI:167900\n", - " PJ34\n", + " CHEBI:188062\n", + " XYLOCARPUS A\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL372303\n", - " PJ34\n", + " CHEMBL3039346\n", + " XYLOCARPUS A\n", " lexical\n", " 0.777778\n", " notebook\n", " \n", " \n", - " 3520\n", + " 574\n", " chebi\n", - " CHEBI:40287\n", - " 7-NITROINDAZOLE\n", + " CHEBI:192723\n", + " L-NIO\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL247378\n", - " 7-NITROINDAZOLE\n", + " CHEMBL11471\n", + " L-NIO\n", " lexical\n", " 0.777778\n", " notebook\n", @@ -526,119 +606,119 @@ " ...\n", " \n", " \n", - " 4662\n", + " 852\n", " chebi\n", - " CHEBI:9423\n", - " Technetium tc 99m sestamibi\n", + " CHEBI:32187\n", + " Technetium Tc 99m succimer\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL4594241\n", - " TECHNETIUM TC 99M SESTAMIBI\n", + " CHEMBL1200797\n", + " TECHNETIUM TC 99M SUCCIMER\n", " lexical\n", " 0.723974\n", " notebook\n", " \n", " \n", - " 3310\n", + " 1182\n", " chebi\n", - " CHEBI:32187\n", - " Technetium Tc 99m succimer\n", + " CHEBI:5938\n", + " Interferon beta-1b\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL1200797\n", - " TECHNETIUM TC 99M SUCCIMER\n", + " CHEMBL1201563\n", + " INTERFERON BETA-1B\n", " lexical\n", " 0.723974\n", " notebook\n", " \n", " \n", - " 3774\n", + " 1183\n", " chebi\n", - " CHEBI:5937\n", - " Interferon alfa-2a\n", + " CHEBI:5939\n", + " Interferon gamma-1b\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL2108508\n", - " INTERFERON ALFA-2A\n", + " CHEMBL1201564\n", + " INTERFERON GAMMA-1B\n", " lexical\n", " 0.723974\n", " notebook\n", " \n", " \n", - " 3775\n", + " 1892\n", " chebi\n", - " CHEBI:5938\n", - " Interferon beta-1b\n", + " CHEBI:9423\n", + " Technetium tc 99m sestamibi\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL1201563\n", - " INTERFERON BETA-1B\n", + " CHEMBL4594241\n", + " TECHNETIUM TC 99M SESTAMIBI\n", " lexical\n", " 0.723974\n", " notebook\n", " \n", " \n", - " 3776\n", + " 1181\n", " chebi\n", - " CHEBI:5939\n", - " Interferon gamma-1b\n", + " CHEBI:5937\n", + " Interferon alfa-2a\n", " skos:exactMatch\n", " chembl.compound\n", - " CHEMBL1201564\n", - " INTERFERON GAMMA-1B\n", + " CHEMBL2108508\n", + " INTERFERON ALFA-2A\n", " lexical\n", " 0.723974\n", " notebook\n", " \n", " \n", "\n", - "

4720 rows × 10 columns

\n", + "

1921 rows × 10 columns

\n", "" ], "text/plain": [ - " source_prefix source_id source_name \\\n", - "4377 chebi CHEBI:80180 Indolicidin \n", - "3524 chebi CHEBI:40701 ALRESTATIN \n", - "1765 chebi CHEBI:167900 PJ34 \n", - "1766 chebi CHEBI:167900 PJ34 \n", - "3520 chebi CHEBI:40287 7-NITROINDAZOLE \n", - "... ... ... ... \n", - "4662 chebi CHEBI:9423 Technetium tc 99m sestamibi \n", - "3310 chebi CHEBI:32187 Technetium Tc 99m succimer \n", - "3774 chebi CHEBI:5937 Interferon alfa-2a \n", - "3775 chebi CHEBI:5938 Interferon beta-1b \n", - "3776 chebi CHEBI:5939 Interferon gamma-1b \n", + " source_prefix source_id source_name \\\n", + "497 chebi CHEBI:190867 1-AMINOCYCLOBUTANE CARBOXYLIC ACID \n", + "904 chebi CHEBI:34827 M2 \n", + "927 chebi CHEBI:35811 2-endo-hydroxy-1,8-cineole \n", + "443 chebi CHEBI:188062 XYLOCARPUS A \n", + "574 chebi CHEBI:192723 L-NIO \n", + "... ... ... ... \n", + "852 chebi CHEBI:32187 Technetium Tc 99m succimer \n", + "1182 chebi CHEBI:5938 Interferon beta-1b \n", + "1183 chebi CHEBI:5939 Interferon gamma-1b \n", + "1892 chebi CHEBI:9423 Technetium tc 99m sestamibi \n", + "1181 chebi CHEBI:5937 Interferon alfa-2a \n", "\n", " relation target_prefix target_identifier \\\n", - "4377 skos:exactMatch chembl.compound CHEMBL2251917 \n", - "3524 skos:exactMatch chembl.compound CHEMBL63055 \n", - "1765 skos:exactMatch chembl.compound CHEMBL3233481 \n", - "1766 skos:exactMatch chembl.compound CHEMBL372303 \n", - "3520 skos:exactMatch chembl.compound CHEMBL247378 \n", + "497 skos:exactMatch chembl.compound CHEMBL131244 \n", + "904 skos:exactMatch chembl.compound CHEMBL4525134 \n", + "927 skos:exactMatch chembl.compound CHEMBL2229602 \n", + "443 skos:exactMatch chembl.compound CHEMBL3039346 \n", + "574 skos:exactMatch chembl.compound CHEMBL11471 \n", "... ... ... ... \n", - "4662 skos:exactMatch chembl.compound CHEMBL4594241 \n", - "3310 skos:exactMatch chembl.compound CHEMBL1200797 \n", - "3774 skos:exactMatch chembl.compound CHEMBL2108508 \n", - "3775 skos:exactMatch chembl.compound CHEMBL1201563 \n", - "3776 skos:exactMatch chembl.compound CHEMBL1201564 \n", + "852 skos:exactMatch chembl.compound CHEMBL1200797 \n", + "1182 skos:exactMatch chembl.compound CHEMBL1201563 \n", + "1183 skos:exactMatch chembl.compound CHEMBL1201564 \n", + "1892 skos:exactMatch chembl.compound CHEMBL4594241 \n", + "1181 skos:exactMatch chembl.compound CHEMBL2108508 \n", "\n", - " target_name type confidence source \n", - "4377 Indolicidin lexical 0.777778 notebook \n", - "3524 ALRESTATIN lexical 0.777778 notebook \n", - "1765 PJ34 lexical 0.777778 notebook \n", - "1766 PJ34 lexical 0.777778 notebook \n", - "3520 7-NITROINDAZOLE lexical 0.777778 notebook \n", - "... ... ... ... ... \n", - "4662 TECHNETIUM TC 99M SESTAMIBI lexical 0.723974 notebook \n", - "3310 TECHNETIUM TC 99M SUCCIMER lexical 0.723974 notebook \n", - "3774 INTERFERON ALFA-2A lexical 0.723974 notebook \n", - "3775 INTERFERON BETA-1B lexical 0.723974 notebook \n", - "3776 INTERFERON GAMMA-1B lexical 0.723974 notebook \n", + " target_name type confidence source \n", + "497 1-AMINOCYCLOBUTANE CARBOXYLIC ACID lexical 0.777778 notebook \n", + "904 M2 lexical 0.777778 notebook \n", + "927 2-endo-hydroxy-1,8-cineole lexical 0.777778 notebook \n", + "443 XYLOCARPUS A lexical 0.777778 notebook \n", + "574 L-NIO lexical 0.777778 notebook \n", + "... ... ... ... ... \n", + "852 TECHNETIUM TC 99M SUCCIMER lexical 0.723974 notebook \n", + "1182 INTERFERON BETA-1B lexical 0.723974 notebook \n", + "1183 INTERFERON GAMMA-1B lexical 0.723974 notebook \n", + "1892 TECHNETIUM TC 99M SESTAMIBI lexical 0.723974 notebook \n", + "1181 INTERFERON ALFA-2A lexical 0.723974 notebook \n", "\n", - "[4720 rows x 10 columns]" + "[1921 rows x 10 columns]" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/drug-indications.ipynb b/notebooks/drug-indications.ipynb index b896a7d..a9840fa 100644 --- a/notebooks/drug-indications.ipynb +++ b/notebooks/drug-indications.ipynb @@ -21,10 +21,10 @@ "import time\n", "\n", "import pandas as pd\n", - "from tqdm.auto import tqdm\n", "from IPython.display import Markdown\n", + "from tqdm.auto import tqdm\n", "\n", - "from chembl_downloader import queries, query, latest, get_date" + "from chembl_downloader import get_date, latest, queries, query" ] }, { diff --git a/notebooks/refresh-static-data.ipynb b/notebooks/refresh-static-data.ipynb index 703a871..66cc0c9 100644 --- a/notebooks/refresh-static-data.ipynb +++ b/notebooks/refresh-static-data.ipynb @@ -25,10 +25,10 @@ "import matplotlib_inline\n", "import pandas as pd\n", "import seaborn as sns\n", - "from scipy import stats\n", "from rdkit import Chem\n", - "from tqdm.auto import tqdm\n", + "from scipy import stats\n", "from sklearn.decomposition import PCA\n", + "from tqdm.auto import tqdm\n", "\n", "import chembl_downloader\n", "import chembl_downloader.contrib" diff --git a/tox.ini b/tox.ini index 45ae714..7f3ee78 100644 --- a/tox.ini +++ b/tox.ini @@ -54,10 +54,12 @@ commands = coverage erase deps = black[jupyter] isort + nbqa skip_install = true commands = black . isort . + nbqa isort notebooks/ description = Run linters. [testenv:doclint]