# %% [markdown]
# # Transcriptomics-specific Analysis
#
# This notebook builds, for every protein-disease pair connected in our subgraphs,
# a summary of the paths that are concordant with the disease-specific
# transcriptomic data.

# %% [markdown]
# # Pre-requirements
#
# 1. Installation of drug2ways
# 1. Running the earlier notebooks (notebooks 2, 3, and 5)
#
# NOTE(review): this notebook is itself numbered 5.0, so the "5" above probably
# refers to a different notebook - confirm the intended prerequisite.

# %% [markdown]
# # Imports

# %%
import json
import logging
import os
from itertools import product

import pandas as pd
from networkx import DiGraph
from tqdm import tqdm

# NOTE(review): the original cell imported `get_paths` but called
# `get_protein_paths`, which was never brought into scope. It is imported here
# on the assumption that `utils` defines it - confirm before running.
from utils import (
    DATA_DIR,
    KG_DATA_PATH,
    create_graph_from_df,
    filter_dataset,
    get_path_count,
    get_paths,
    get_protein_paths,
    get_transcriptomic_paths,
)

# %%
logger = logging.getLogger(__name__)
logging.getLogger('drug2ways').setLevel(logging.CRITICAL)
logging.basicConfig(level=logging.CRITICAL)

# %% [markdown]
# # Load dataset-generated network dataframe

# %%
openbiolink_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'openbiolink_filtered_kg.tsv'),
    sep='\t'
)

custom_df = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'custom_filtered_kg.tsv'),
    sep='\t'
)

# %% [markdown]
# # Load disease datasets

# %%
with open(os.path.join(DATA_DIR, 'transcriptomics', 'geo_harmonized_expression.json')) as file:
    geo_dict = json.load(file)

with open(os.path.join(DATA_DIR, 'transcriptomics', 'target_harmonized_expression.json')) as file2:
    open_target_dict = json.load(file2)

# %% [markdown]
# # Filtering disease dataset based on network

# %%
# The filtered datasets get their own names instead of overwriting `geo_dict` /
# `open_target_dict`, so this cell can be re-run without reloading the raw JSON.

# GEO
geo_by_graph = {
    'openbio': filter_dataset(dataset=geo_dict, graph_df=openbiolink_df),
    'custom': filter_dataset(dataset=geo_dict, graph_df=custom_df),
}

# OpenTarget
open_target_by_graph = {
    'openbio': filter_dataset(dataset=open_target_dict, graph_df=openbiolink_df),
    'custom': filter_dataset(dataset=open_target_dict, graph_df=custom_df),
}

# %% [markdown]
# # Load clinical and drug-indication data

# %%
with open(os.path.join(DATA_DIR, 'gold-standard', 'filtered-clinical-pairs.json')) as file:
    # Only the keys of the JSON object are kept (a `dict_keys` view).
    clinical_pair_dict = json.load(file).keys()

# %% [markdown]
# # Creating information dict for each protein-disease pair

# %%
# Disease dataset name -> {subgraph name -> filtered dataset}.
MAP = {
    'target': open_target_by_graph,
    'geo': geo_by_graph,
}

# Subgraph name -> knowledge-graph edge table.
KG_BY_NAME = {
    'openbio': openbiolink_df,
    'custom': custom_df,
}

# Maximum path length handed to `get_protein_paths`.
LMAX = 4  # Just accepts one value, change in Utils if needed.

# Edge markers that are interleaved with the node names in the returned paths.
EDGE_SYMBOLS = ('-|', '->')

# Column order of every result table written by `main`.
RESULT_COLUMNS = [
    'source',
    'target',
    'number_of_paths',
    'number_of_concordant_paths',
    'in_clinical_trial',
    'number_of_concordant_activatory_paths',
    'number_of_concordant_inhibitory_paths',
    'subgraph_size',
    'number_of_unique_nodes',
    # 'lmax',
    'subgraph_name',
]

# Keys of the `get_transcriptomic_paths` result that `main` copies verbatim.
_RESULT_METADATA_KEYS = frozenset({
    'source',
    'target',
    'number_of_paths',
    'in_clinical_trial',
    'subgraph_size',
    'number_of_unique_nodes',
})


# %%
def _nodes_only(path_elements):
    """Return the elements of one path with the edge markers removed."""
    return [element for element in path_elements if element not in EDGE_SYMBOLS]


def _concordant_paths(results, concordant_key=None):
    """Pick the concordant-path collection out of a `get_transcriptomic_paths` result.

    NOTE(review): the original cell read `results[i]`, but `i` was never defined,
    so the intended key cannot be recovered from the notebook. When
    `concordant_key` is not given, the collection is taken to be the single entry
    of `results` that is not one of the metadata keys read elsewhere in `main`.

    :param results: dict returned by `get_transcriptomic_paths`
    :param concordant_key: explicit key of the concordant paths, if known
    :return: the value stored under the selected key
    :raises ValueError: if no key is given and zero or several candidates remain
    """
    if concordant_key is not None:
        return results[concordant_key]

    candidates = [key for key in results if key not in _RESULT_METADATA_KEYS]
    if len(candidates) != 1:
        raise ValueError(
            'Cannot infer which entry holds the concordant paths '
            f'(candidates: {candidates!r}); pass `concordant_key` explicitly.'
        )
    return results[candidates[0]]


def main(concordant_key=None):
    """Write one table of concordant-path statistics per subgraph and disease dataset.

    For each subgraph in `KG_BY_NAME`, paths starting at its protein nodes (nodes
    whose name contains 'ncbigene') are requested from `get_protein_paths`. Every
    source/target pair that has at least one path, and whose target is present in
    a disease dataset of `MAP`, is passed to `get_transcriptomic_paths`; the
    result becomes one row of that dataset's table.

    The tables are written to `DATA_DIR/concordant_paths/<graph>-<dataset>.tsv`.
    NOTE(review): the original save used the undefined names `graph_name`, `i`
    and `val`; one file per (subgraph, disease dataset) is an assumption that
    keeps the `<graph>-<x>.tsv` naming and keeps rows of the two datasets apart.

    :param concordant_key: key of the concordant paths in the result of
        `get_transcriptomic_paths`; inferred when omitted (see `_concordant_paths`)
    :raises ValueError: if the concordant-path entry cannot be inferred
    """
    output_dir = os.path.join(DATA_DIR, 'concordant_paths')
    # `exist_ok` avoids the check-then-create race of `exists` + `mkdir`.
    os.makedirs(output_dir, exist_ok=True)

    for gname, kg_df in KG_BY_NAME.items():
        graph = create_graph_from_df(kg_df).copy()

        # Get protein nodes
        protein_nodes = [node for node in graph.nodes() if 'ncbigene' in node]

        paths = get_protein_paths(
            graph=graph,
            protein_list=protein_nodes,
            lmax=LMAX,
        )

        # Rows are collected as plain dicts and turned into a frame once per
        # table; concatenating a one-row frame per pair is quadratic.
        rows_by_dataset = {dataset_name: [] for dataset_name in MAP}

        for lmax, path_list in tqdm(paths.items(), desc='Calculating concordance'):
            for p in path_list:
                if len(p['paths']) == 0:
                    continue

                # Just get the nodes from the path
                node_paths = [_nodes_only(elements) for elements in p['paths'].values()]

                protein = p['source']
                disease = p['target']

                # Iterate over the datasets themselves; iterating `MAP` directly
                # yields its string keys, which cannot be indexed by `gname`.
                for dataset_name, dataset_by_graph in MAP.items():
                    if disease not in dataset_by_graph[gname]:
                        continue

                    results = get_transcriptomic_paths(
                        directed_graph=graph,
                        source=protein,  # was the undefined name `chemical`
                        target=disease,
                        all_paths=node_paths,
                        disease_dict=dataset_by_graph[gname][disease],
                        clinical_pair_dict=clinical_pair_dict,
                    )

                    concordant = _concordant_paths(results, concordant_key)
                    concordant_num = len(concordant)

                    # Default to zero so a pair without concordant paths does not
                    # reuse the counts of the previous pair (or hit a NameError).
                    activated_paths, inhibited_paths = 0, 0
                    if concordant_num != 0:
                        activated_paths, inhibited_paths = get_path_count(
                            directed_graph=graph,
                            filtered_paths=concordant,
                        )

                    rows_by_dataset[dataset_name].append({
                        'source': results['source'],
                        'target': results['target'],
                        'number_of_paths': results['number_of_paths'],
                        'number_of_concordant_paths': concordant_num,
                        'in_clinical_trial': results['in_clinical_trial'],
                        'number_of_concordant_activatory_paths': activated_paths,
                        'number_of_concordant_inhibitory_paths': inhibited_paths,
                        'subgraph_size': results['subgraph_size'],
                        'number_of_unique_nodes': results['number_of_unique_nodes'],
                        'subgraph_name': gname,  # TODO: If changing lmax to range, return lmax here too
                    })

        for dataset_name, rows in rows_by_dataset.items():
            n_file_path = os.path.join(output_dir, f'{gname}-{dataset_name}.tsv')
            # Passing `columns` keeps the header even when a table has no rows.
            pd.DataFrame(rows, columns=RESULT_COLUMNS).to_csv(n_file_path, sep='\t', index=False)